def train_model(params: Dict[str, Any]) -> float:
    np.random.seed(0)
    model_dir = opt.EXPERIMENT_DIR

    logger.info('=' * 50)
    logger.info(f'hyperparameters: {params}')

    train_loader, val_loader, test_loader = load_data(args.fold, params)
    model = create_model(args.predict, float(params['dropout']))
    # freeze_layers(model)

    # if torch.cuda.device_count() == 1:
    #     torchsummary.summary(model, (3, 224, 224))

    if opt.TRAIN.OPTIMIZER == 'Adam':
        optimizer = optim.Adam(model.parameters(), opt.TRAIN.LEARNING_RATE)
    elif opt.TRAIN.OPTIMIZER == 'SGD':
        optimizer = optim.SGD(model.parameters(), opt.TRAIN.LEARNING_RATE,
                              momentum=0.9, nesterov=True)
    else:
        raise RuntimeError(f'unknown optimizer: {opt.TRAIN.OPTIMIZER}')

    if opt.TRAIN.COSINE.ENABLE:
        set_lr(optimizer, opt.TRAIN.COSINE.LR)
        lr_scheduler = CosineLRWithRestarts(
            optimizer, opt.TRAIN.BATCH_SIZE,
            opt.TRAIN.BATCH_SIZE * opt.TRAIN.STEPS_PER_EPOCH,
            restart_period=opt.TRAIN.COSINE.PERIOD,
            t_mult=opt.TRAIN.COSINE.COEFF)
    else:
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', patience=opt.TRAIN.PATIENCE,
            factor=opt.TRAIN.LR_REDUCE_FACTOR, verbose=True,
            min_lr=opt.TRAIN.MIN_LR, threshold=opt.TRAIN.MIN_IMPROVEMENT,
            threshold_mode='abs')

    if args.weights is None:
        last_epoch = 0
        logger.info(f'training will start from epoch {last_epoch + 1}')
    else:
        last_checkpoint = torch.load(args.weights)
        assert last_checkpoint['arch'] == opt.MODEL.ARCH
        model.load_state_dict(last_checkpoint['state_dict'])
        optimizer.load_state_dict(last_checkpoint['optimizer'])
        logger.info(f'checkpoint {args.weights} was loaded.')

        last_epoch = last_checkpoint['epoch']
        logger.info(f'loaded the model from epoch {last_epoch}')
        set_lr(optimizer, opt.TRAIN.LEARNING_RATE)

    if args.predict:
        print('inference mode')
        generate_submission(val_loader, test_loader, model, last_epoch,
                            args.weights)
        sys.exit(0)

    if opt.TRAIN.LOSS == 'BCE':
        criterion = nn.BCEWithLogitsLoss()
    else:
        raise RuntimeError('unknown loss specified')

    best_score = 0.0
    best_epoch = 0
    last_lr = read_lr(optimizer)
    best_model_path = None

    for epoch in range(last_epoch + 1, opt.TRAIN.EPOCHS + 1):
        logger.info('-' * 50)

        if not opt.TRAIN.COSINE.ENABLE:
            lr = read_lr(optimizer)

            # if the LR was just reduced, reload the best model so far
            if lr < last_lr - 1e-10 and best_model_path is not None:
                last_checkpoint = torch.load(
                    os.path.join(model_dir, best_model_path))
                assert last_checkpoint['arch'] == opt.MODEL.ARCH
                model.load_state_dict(last_checkpoint['state_dict'])
                optimizer.load_state_dict(last_checkpoint['optimizer'])
                logger.info(f'checkpoint {best_model_path} was loaded.')
                set_lr(optimizer, lr)
                last_lr = lr

            if lr < opt.TRAIN.MIN_LR * 1.01:
                logger.info('reached minimum LR, stopping')
                break
                # logger.info(f'lr={lr}, start cosine annealing!')
                # set_lr(optimizer, opt.TRAIN.COSINE.LR)
                # opt.TRAIN.COSINE.ENABLE = True
                #
                # lr_scheduler = CosineLRWithRestarts(
                #     optimizer, opt.TRAIN.BATCH_SIZE,
                #     opt.TRAIN.BATCH_SIZE * opt.TRAIN.STEPS_PER_EPOCH,
                #     restart_period=opt.TRAIN.COSINE.PERIOD,
                #     t_mult=opt.TRAIN.COSINE.COEFF)

        if opt.TRAIN.COSINE.ENABLE:
            lr_scheduler.step()
            read_lr(optimizer)

        train(train_loader, model, criterion, optimizer, epoch, lr_scheduler)
        score, _ = validate(val_loader, model, epoch)

        if not opt.TRAIN.COSINE.ENABLE:
            lr_scheduler.step(score)    # type: ignore

        is_best = score > best_score
        best_score = max(score, best_score)
        if is_best:
            best_epoch = epoch

        data_to_save = {
            'epoch': epoch,
            'arch': opt.MODEL.ARCH,
            'state_dict': model.state_dict(),
            'best_score': best_score,
            'score': score,
            'optimizer': optimizer.state_dict(),
            'options': opt
        }

        filename = opt.MODEL.VERSION
        if is_best:
            best_model_path = f'{filename}_f{args.fold}_e{epoch:02d}_{score:.04f}.pth'
            save_checkpoint(data_to_save, best_model_path, model_dir)

    logger.info(f'best score: {best_score:.04f}')
    # negated because the hyperparameter search minimizes this objective
    return -best_score
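# Helper functions used above but not defined in this snippet. These are
# minimal sketches of what set_lr(), read_lr() and save_checkpoint() most
# likely do, assuming a single shared learning rate across parameter
# groups -- not the verbatim implementations from the original repo.

def set_lr(optimizer: optim.Optimizer, lr: float) -> None:
    # overwrite the learning rate of every parameter group in place
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

def read_lr(optimizer: optim.Optimizer) -> float:
    # log and return the current learning rate of the first group
    lr = optimizer.param_groups[0]['lr']
    logger.info(f'learning rate: {lr}')
    return lr

def save_checkpoint(state: Dict[str, Any], filename: str, model_dir: str) -> None:
    # serialize the full training state next to the other snapshots
    torch.save(state, os.path.join(model_dir, filename))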
if opt.TRAIN.OPTIMIZER == 'Adam':
    optimizer = optim.Adam(model.parameters(), opt.TRAIN.LEARNING_RATE)
elif opt.TRAIN.OPTIMIZER == 'SGD':
    optimizer = optim.SGD(model.parameters(), opt.TRAIN.LEARNING_RATE,
                          momentum=0.9, nesterov=True)
else:
    raise RuntimeError(f'unknown optimizer: {opt.TRAIN.OPTIMIZER}')

if opt.TRAIN.COSINE.ENABLE:
    set_lr(optimizer, opt.TRAIN.COSINE.LR)
    lr_scheduler = CosineLRWithRestarts(
        optimizer, opt.TRAIN.BATCH_SIZE,
        opt.TRAIN.BATCH_SIZE * opt.TRAIN.STEPS_PER_EPOCH,
        restart_period=opt.TRAIN.COSINE.PERIOD,
        t_mult=opt.TRAIN.COSINE.COEFF)
else:
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', patience=opt.TRAIN.PATIENCE,
        factor=opt.TRAIN.LR_REDUCE_FACTOR, verbose=True,
        min_lr=opt.TRAIN.MIN_LR, threshold=opt.TRAIN.MIN_IMPROVEMENT,
        threshold_mode='abs')

if args.pretrained is None:
    last_epoch = 0
Path(opt.out_path).mkdir(parents=True, exist_ok=True)

train_set = HalfHalfDataset(opt.real_path, opt.syn_path, opt.params_path,
                            opt.blend, opt.channels, opt.split)
train_loader = DataLoader(dataset=train_set, num_workers=opt.threads,
                          batch_size=opt.batch_size, shuffle=True,
                          pin_memory=True)

val_set = RealDataset(opt.real_path, opt.channels, split='val')
val_loader = DataLoader(dataset=val_set, num_workers=0, batch_size=1,
                        shuffle=False)

test_set = RealDataset(opt.real_path, opt.channels, split='test')
test_loader = DataLoader(dataset=test_set, num_workers=0, batch_size=1,
                         shuffle=False)

opt.n_classes = train_set.n_classes
net = PowderNet(opt.arch, opt.n_channels, train_set.n_classes)
net = net.cuda()

# discriminative learning rates: the head trains 10x faster than the backbone
optimizer = AdamW([{'params': get_1x_lr_params(net)},
                   {'params': get_10x_lr_params(net), 'lr': opt.lr * 10}],
                  lr=opt.lr, weight_decay=opt.decay)
scheduler = CosineLRWithRestarts(optimizer, opt.batch_size, len(train_set),
                                 opt.period, opt.t_mult)
vis = Visualizer(server=opt.server, env=opt.env)

start_epoch = 0
if opt.resume is not None:
    checkpoint = torch.load(opt.resume)

    # make sure the checkpoint was produced with a compatible configuration
    old_opt = checkpoint['opt']
    assert old_opt.channels == opt.channels
    assert old_opt.bands == opt.bands
    assert old_opt.arch == opt.arch
    assert old_opt.blend == opt.blend
    assert old_opt.lr == opt.lr
    assert old_opt.decay == opt.decay
    assert old_opt.period == opt.period
    assert old_opt.t_mult == opt.t_mult

    net.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
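# get_1x_lr_params() / get_10x_lr_params() are not shown in this snippet.
# A minimal sketch of the usual discriminative-LR split, assuming PowderNet
# exposes a pretrained backbone and a freshly initialized classifier head
# (the attribute names here are hypothetical):

def get_1x_lr_params(net):
    # pretrained backbone: small updates at the base learning rate
    return net.backbone.parameters()

def get_10x_lr_params(net):
    # randomly initialized head: larger updates, hence 10x the base rate
    return net.classifier.parameters()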
def run() -> float:
    np.random.seed(0)
    model_dir = config.experiment_dir
    logger.info('=' * 50)

    train_loader, val_loader, test_loader = load_data(args.fold)

    logger.info(f'creating a model {config.model.arch}')
    model = create_model(config, pretrained=args.weights is None).cuda()
    criterion = get_loss(config)

    if args.summary:
        torchsummary.summary(model, (3, config.model.input_size,
                                     config.model.input_size))

    if args.lr_finder:
        optimizer = get_optimizer(config, model.parameters())
        lr_finder(train_loader, model, criterion, optimizer)
        sys.exit()

    if args.weights is None and config.train.head_only_warmup:
        logger.info('-' * 50)
        logger.info(f'doing warmup for {config.train.warmup.steps} steps')
        logger.info(f'max_lr will be {config.train.warmup.max_lr}')

        optimizer = get_optimizer(config, model.parameters())
        warmup_scheduler = get_warmup_scheduler(config, optimizer)

        freeze_layers(model)
        train_epoch(train_loader, model, criterion, optimizer, 0,
                    warmup_scheduler, None, config.train.warmup.steps)
        unfreeze_layers(model)

    if args.weights is None and config.train.enable_warmup:
        logger.info('-' * 50)
        logger.info(f'doing warmup for {config.train.warmup.steps} steps')
        logger.info(f'max_lr will be {config.train.warmup.max_lr}')

        optimizer = get_optimizer(config, model.parameters())
        warmup_scheduler = get_warmup_scheduler(config, optimizer)
        train_epoch(train_loader, model, criterion, optimizer, 0,
                    warmup_scheduler, None, config.train.warmup.steps)

    optimizer = get_optimizer(config, model.parameters())

    if args.weights is None:
        last_epoch = -1
    else:
        last_checkpoint = torch.load(args.weights)

        model_arch = last_checkpoint['arch'].replace('se_', 'se')
        if model_arch != config.model.arch:
            dprint(model_arch)
            dprint(config.model.arch)
            assert model_arch == config.model.arch

        model.load_state_dict(last_checkpoint['state_dict'])
        if 'optimizer' in last_checkpoint:
            optimizer.load_state_dict(last_checkpoint['optimizer'])
        logger.info(f'checkpoint loaded: {args.weights}')

        last_epoch = last_checkpoint['epoch'] if 'epoch' in last_checkpoint else 99
        logger.info(f'loaded the model from epoch {last_epoch}')

        if args.lr != 0:
            set_lr(optimizer, float(args.lr))
        elif 'lr' in config.optimizer.params:
            set_lr(optimizer, config.optimizer.params.lr)
        elif 'base_lr' in config.scheduler.params:
            set_lr(optimizer, config.scheduler.params.base_lr)

    if not args.cosine:
        lr_scheduler = get_scheduler(
            config.scheduler, optimizer,
            last_epoch=(last_epoch if config.scheduler.name != 'cyclic_lr' else -1))

        assert config.scheduler2.name == ''
        lr_scheduler2 = get_scheduler(config.scheduler2, optimizer,
                                      last_epoch=last_epoch) \
                        if config.scheduler2.name else None
    else:
        epoch_size = min(len(train_loader), config.train.max_steps_per_epoch) \
                     * config.train.batch_size

        set_lr(optimizer, float(config.cosine.start_lr))
        lr_scheduler = CosineLRWithRestarts(
            optimizer,
            batch_size=config.train.batch_size,
            epoch_size=epoch_size,
            restart_period=config.cosine.period,
            period_inc=config.cosine.period_inc,
            max_period=config.cosine.max_period)
        lr_scheduler2 = None

    if args.predict_oof or args.predict_test:
        print('inference mode')
        assert args.weights is not None

        if args.predict_oof:
            gen_train_prediction(val_loader, model, last_epoch, args.weights)
        else:
            gen_test_prediction(test_loader, model, args.weights)

        sys.exit()

    logger.info(f'training will start from epoch {last_epoch + 1}')

    best_score = 0.0
    best_epoch = 0

    last_lr = get_lr(optimizer)
    best_model_path = args.weights

    for epoch in range(last_epoch + 1, config.train.num_epochs):
        logger.info('-' * 50)

        if not is_scheduler_continuous(lr_scheduler) and lr_scheduler2 is None:
            # if we have just reduced the LR, reload the best saved model
            lr = get_lr(optimizer)

            if lr < last_lr - 1e-10 and best_model_path is not None:
                logger.info(f'learning rate dropped: {lr}, reloading')
                last_checkpoint = torch.load(best_model_path)

                assert last_checkpoint['arch'] == config.model.arch
                model.load_state_dict(last_checkpoint['state_dict'])
                optimizer.load_state_dict(last_checkpoint['optimizer'])
                logger.info(f'checkpoint loaded: {best_model_path}')
                set_lr(optimizer, lr)

            last_lr = lr

        if config.train.lr_decay_coeff != 0 and epoch in config.train.lr_decay_milestones:
            n_cycles = config.train.lr_decay_milestones.index(epoch) + 1
            total_coeff = config.train.lr_decay_coeff ** n_cycles
            logger.info(f'artificial LR scheduler: made {n_cycles} cycles, '
                        f'decreasing LR by {total_coeff}')

            set_lr(optimizer, config.scheduler.params.base_lr * total_coeff)
            lr_scheduler = get_scheduler(config.scheduler, optimizer,
                                         coeff=total_coeff, last_epoch=-1)

        if isinstance(lr_scheduler, CosineLRWithRestarts):
            restart = lr_scheduler.epoch_step()
            if restart:
                logger.info('cosine annealing restarted, resetting the best metric')
                best_score = min(config.cosine.min_metric_val, best_score)

        train_epoch(train_loader, model, criterion, optimizer, epoch,
                    lr_scheduler, lr_scheduler2, config.train.max_steps_per_epoch)
        score, _, _ = validate(val_loader, model, epoch)

        if isinstance(lr_scheduler, ReduceLROnPlateau):
            lr_scheduler.step(metrics=score)
        elif not is_scheduler_continuous(lr_scheduler):
            lr_scheduler.step()

        if isinstance(lr_scheduler2, ReduceLROnPlateau):
            lr_scheduler2.step(metrics=score)
        elif lr_scheduler2 and not is_scheduler_continuous(lr_scheduler2):
            lr_scheduler2.step()

        is_best = score > best_score
        best_score = max(score, best_score)

        if is_best:
            best_epoch = epoch
            best_model_path = os.path.join(
                model_dir,
                f'{config.version}_f{args.fold}_e{epoch:02d}_{score:.04f}.pth')

            data_to_save = {
                'epoch': epoch,
                'arch': config.model.arch,
                'state_dict': model.state_dict(),
                'score': score,
                'optimizer': optimizer.state_dict(),
                'config': config
            }

            torch.save(data_to_save, best_model_path)
            logger.info(f'a snapshot was saved to {best_model_path}')

    logger.info(f'best score: {best_score:.04f}')
    return -best_score
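# Sketches of helpers referenced above. is_scheduler_continuous() plausibly
# separates schedulers stepped every batch (inside train_epoch) from those
# stepped once per epoch; freeze_layers() / unfreeze_layers() plausibly
# toggle requires_grad on everything except the classifier head. The exact
# class list and attribute names are assumptions, not the repo's code.

def is_scheduler_continuous(scheduler) -> bool:
    # batch-level schedulers must not receive the epoch-level step() above
    return isinstance(scheduler, (CosineLRWithRestarts,
                                  torch.optim.lr_scheduler.CyclicLR))

def freeze_layers(model) -> None:
    # train only the head during the head-only warmup phase
    for param in model.parameters():
        param.requires_grad = False
    for param in model.last_linear.parameters():
        param.requires_grad = True

def unfreeze_layers(model) -> None:
    # re-enable gradients for the whole network after warmup
    for param in model.parameters():
        param.requires_grad = True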
                     pretrained='imagenet')
assert opt.MODEL.INPUT_SIZE % 32 == 0
model.avgpool = nn.AvgPool2d(opt.MODEL.INPUT_SIZE // 32, stride=1)
model.last_linear = nn.Linear(model.last_linear.in_features,
                              DATA_INFO.NUM_CLASSES)
model = torch.nn.DataParallel(model).cuda()

if torch.cuda.device_count() == 1:
    torchsummary.summary(model, (3, opt.MODEL.INPUT_SIZE, opt.MODEL.INPUT_SIZE))

optimizer = optim.Adam(model.module.parameters(), opt.TRAIN.LEARNING_RATE)
lr_scheduler = CosineLRWithRestarts(
    optimizer, opt.TRAIN.BATCH_SIZE,
    opt.TRAIN.BATCH_SIZE * opt.TRAIN.STEPS_PER_EPOCH,
    restart_period=50, t_mult=1.2)

if opt.TRAIN.RESUME is None:
    last_epoch = 0
    logger.info(f"Training will start from epoch {last_epoch + 1}")
else:
    last_checkpoint = torch.load(opt.TRAIN.RESUME)
    assert last_checkpoint['arch'] == opt.MODEL.ARCH
    model.module.load_state_dict(last_checkpoint['state_dict'])
    optimizer.load_state_dict(last_checkpoint['optimizer'])
    logger.info(f"Checkpoint {opt.TRAIN.RESUME} was loaded.")
    last_epoch = last_checkpoint['epoch']