def test_model_loader_find_best(tmpdir):
    """ModelLoader.find_best returns the best checkpoint for a monitored metric.

    Checkpoints saved without a monitor cannot be ranked, so find_best
    returns None for them.
    """
    # empty directory: nothing to find
    assert ModelLoader.find_best(tmpdir, "test") is None

    # checkpoints saved without a monitored metric
    no_monitor_trainer = DummyTrainer(
        default_root_dir=tmpdir,
        callbacks=[
            pl.callbacks.ModelCheckpoint(
                dirpath=tmpdir, save_top_k=-1, filename="{epoch}"
            )
        ],
        checkpoint_callback=True,
        max_epochs=3,
    )
    no_monitor_trainer.fit(DummyEngine(), datamodule=DummyMNIST())
    assert ModelLoader.find_best(tmpdir, "test") is None

    # checkpoints saved with a monitored metric
    metric = "bar"
    ckpt_callback = pl.callbacks.ModelCheckpoint(
        dirpath=tmpdir,
        save_top_k=-1,
        monitor=metric,
        mode="max",
        filename="{epoch}",
    )
    monitored_trainer = DummyTrainer(
        default_root_dir=tmpdir,
        callbacks=[ckpt_callback],
        checkpoint_callback=True,
        max_epochs=3,
    )
    monitored_trainer.fit(DummyEngine(), datamodule=DummyMNIST())
    # best under "max" is the last epoch, and agrees with the callback's choice
    best_max = ModelLoader.find_best(tmpdir, metric, mode="max")
    assert best_max == tmpdir / "epoch=2-v0.ckpt" == ckpt_callback.best_model_path
    # best under "min" is the first epoch
    best_min = ModelLoader.find_best(tmpdir, metric, mode="min")
    assert best_min == tmpdir / "epoch=0-v0.ckpt"
def test_model_loader_choose_by(tmpdir, input, expected):
    """choose_by resolves a (possibly globbed) pattern among several checkpoints."""
    total = 15
    for idx in range(total):
        torch.save(None, Path(tmpdir) / f"test-{idx}.ckpt")
    # sanity check: all fixture files were written
    assert len(os.listdir(tmpdir)) == total
    assert ModelLoader.choose_by(f"{tmpdir}/{input}") == f"{tmpdir}/{expected}"
def test_model_loader_prepare_checkpoint(tmpdir):
    """prepare_checkpoint resolves empty, direct, bare and globbed checkpoint specs."""
    # create some checkpoints
    metric = "bar"
    exp_dirpath = tmpdir / "experiment"
    trainer = DummyTrainer(
        default_root_dir=tmpdir,
        callbacks=[
            pl.callbacks.ModelCheckpoint(
                dirpath=exp_dirpath,
                save_top_k=-1,
                monitor=metric,
                mode="max",
                filename="{epoch}",
            )
        ],
        checkpoint_callback=True,
        max_epochs=2,
    )
    trainer.fit(DummyEngine(), datamodule=DummyMNIST())
    expected = exp_dirpath / "epoch=0.ckpt"

    # empty spec
    assert ModelLoader.prepare_checkpoint("", exp_dirpath, metric) == expected
    # direct path
    assert ModelLoader.prepare_checkpoint(expected, exp_dirpath, metric) == expected
    # direct path outside of exp_dirpath
    shutil.copy(expected, "/tmp")
    outside = ModelLoader.prepare_checkpoint("/tmp/epoch=0.ckpt", exp_dirpath, metric)
    assert outside == "/tmp/epoch=0.ckpt"
    # bare filename, resolved inside exp_dirpath
    assert ModelLoader.prepare_checkpoint("epoch=0.ckpt", exp_dirpath, metric) == expected
    # globbed filename
    globbed = ModelLoader.prepare_checkpoint("epoch=?.ckpt", exp_dirpath, metric)
    assert globbed == exp_dirpath / "epoch=1.ckpt"

    # failures
    with pytest.raises(AssertionError, match="Could not find a valid checkpoint in"):
        ModelLoader.prepare_checkpoint("", tmpdir, metric)
    with pytest.raises(AssertionError, match="Could not find the checkpoint"):
        ModelLoader.prepare_checkpoint("?", exp_dirpath, metric)
def run(
    img_list: str,
    img_dirs: Optional[List[str]] = None,
    common: CommonArgs = CommonArgs(),
    data: DataArgs = DataArgs(),
    netout: NetoutArgs = NetoutArgs(),
    trainer: TrainerArgs = TrainerArgs(),
):
    """Dump the network output for a list of images as Kaldi matrix/lattice archives.

    Args:
        img_list: file listing the image ids to process.
        img_dirs: directories in which the listed images live.
        common / data / netout / trainer: grouped CLI argument objects.
    """
    # resolve the checkpoint and load the model from it
    loader = ModelLoader(
        common.train_path, filename=common.model_filename, device="cpu"
    )
    checkpoint = loader.prepare_checkpoint(
        common.checkpoint, common.experiment_dirpath, common.monitor
    )
    model = loader.load_by(checkpoint)
    assert (
        model is not None
    ), "Could not find the model. Have you run pylaia-htr-create-model?"

    # prepare the evaluator
    evaluator_module = EvaluatorModule(
        model,
        batch_input_fn=Compose([ItemFeeder("img"), ImageFeeder()]),
        batch_id_fn=ItemFeeder("id"),
    )

    # prepare the data
    data_module = DataModule(
        img_dirs=img_dirs,
        te_img_list=img_list,
        batch_size=data.batch_size,
        color_mode=data.color_mode,
        stage="test",
    )

    # prepare the kaldi writers; at least one output must be requested
    writers = []
    if netout.matrix is not None:
        matrix_path = join(common.experiment_dirpath, netout.matrix)
        writers.append(ArchiveMatrixWriter(matrix_path))
    if netout.lattice is not None:
        lattice_path = join(common.experiment_dirpath, netout.lattice)
        writers.append(
            ArchiveLatticeWriter(lattice_path, digits=netout.digits, negate=True)
        )
    assert (
        writers
    ), "You did not specify any output file! Use the matrix/lattice arguments"

    # prepare the testing callbacks
    callbacks = [
        Netout(writers, output_transform=netout.output_transform),
        ProgressBar(refresh_rate=trainer.progress_bar_refresh_rate),
    ]

    # prepare the trainer (the TrainerArgs object is expanded into pl.Trainer kwargs)
    trainer = pl.Trainer(
        default_root_dir=common.train_path,
        callbacks=callbacks,
        logger=False,
        **vars(trainer),
    )

    # run netout!
    trainer.test(evaluator_module, datamodule=data_module, verbose=False)
def run(
    syms: str,
    img_dirs: List[str],
    tr_txt_table: str,
    va_txt_table: str,
    common: CommonArgs = CommonArgs(),
    train: TrainArgs = TrainArgs(),
    optimizer: OptimizerArgs = OptimizerArgs(),
    scheduler: SchedulerArgs = SchedulerArgs(),
    data: DataArgs = DataArgs(),
    trainer: TrainerArgs = TrainerArgs(),
):
    """Train an HTR model, optionally resuming from a checkpoint.

    Args:
        syms: path to the symbols table mapping characters to integers.
        img_dirs: directories containing the training/validation images.
        tr_txt_table: transcriptions of the training images.
        va_txt_table: transcriptions of the validation images.
        common / train / optimizer / scheduler / data / trainer:
            grouped CLI argument objects.
    """
    pl.seed_everything(common.seed)

    loader = ModelLoader(
        common.train_path, filename=common.model_filename, device="cpu"
    )
    # maybe load a checkpoint
    checkpoint = None
    if train.resume:
        checkpoint = loader.prepare_checkpoint(
            common.checkpoint, common.experiment_dirpath, common.monitor
        )
        # extend training by `train.resume` epochs past the checkpoint's epoch
        trainer.max_epochs = torch.load(checkpoint)["epoch"] + train.resume
        log.info(f'Using checkpoint "{checkpoint}"')
        log.info(f"Max epochs set to {trainer.max_epochs}")

    # load the non-pytorch_lightning model
    model = loader.load()
    assert (
        model is not None
    ), "Could not find the model. Have you run pylaia-htr-create-model?"

    # prepare the symbols
    syms = SymbolsTable(syms)
    for d in train.delimiters:
        assert d in syms, f'The delimiter "{d}" is not available in the symbols file'

    # prepare the engine
    engine_module = HTREngineModule(
        model,
        [syms[d] for d in train.delimiters],
        optimizer=optimizer,
        scheduler=scheduler,
        batch_input_fn=Compose([ItemFeeder("img"), ImageFeeder()]),
        batch_target_fn=ItemFeeder("txt"),
        batch_id_fn=ItemFeeder("id"),  # Used to print image ids on exception
    )

    # prepare the data
    data_module = DataModule(
        syms=syms,
        img_dirs=img_dirs,
        tr_txt_table=tr_txt_table,
        va_txt_table=va_txt_table,
        batch_size=data.batch_size,
        color_mode=data.color_mode,
        shuffle_tr=not bool(trainer.limit_train_batches),
        augment_tr=train.augment_training,
        stage="fit",
    )

    # prepare the training callbacks
    # TODO: save on lowest_va_wer and every k epochs https://github.com/PyTorchLightning/pytorch-lightning/issues/2908
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        dirpath=common.experiment_dirpath,
        filename="{epoch}-lowest_" + common.monitor,
        monitor=common.monitor,
        verbose=True,
        save_top_k=train.checkpoint_k,
        mode="min",
        save_last=True,
    )
    checkpoint_callback.CHECKPOINT_NAME_LAST = "{epoch}-last"
    early_stopping_callback = pl.callbacks.EarlyStopping(
        monitor=common.monitor,
        patience=train.early_stopping_patience,
        verbose=True,
        mode="min",
        strict=False,  # training_step may return None
    )
    # FIX: checkpoint_callback was previously registered twice in this list,
    # which made the checkpointing logic run twice per validation epoch.
    callbacks = [
        ProgressBar(refresh_rate=trainer.progress_bar_refresh_rate),
        checkpoint_callback,
        early_stopping_callback,
    ]
    if train.gpu_stats:
        callbacks.append(ProgressBarGPUStats())
    if scheduler.active:
        callbacks.append(LearningRate(logging_interval="epoch"))

    # prepare the trainer (TrainerArgs is expanded into pl.Trainer kwargs)
    trainer = pl.Trainer(
        default_root_dir=common.train_path,
        resume_from_checkpoint=checkpoint,
        callbacks=callbacks,
        logger=EpochCSVLogger(common.experiment_dirpath),
        checkpoint_callback=True,
        **vars(trainer),
    )

    # train!
    trainer.fit(engine_module, datamodule=data_module)

    # training is over
    if early_stopping_callback.stopped_epoch:
        log.info(
            "Early stopping triggered after epoch"
            f" {early_stopping_callback.stopped_epoch + 1} (waited for"
            f" {early_stopping_callback.wait_count} epochs). The best score was"
            f" {early_stopping_callback.best_score}"
        )
    log.info(
        f"Model has been trained for {trainer.current_epoch + 1} epochs"
        f" ({trainer.global_step + 1} steps)"
    )
    log.info(
        f"Best {checkpoint_callback.monitor}={checkpoint_callback.best_model_score} "
        f"obtained with model={checkpoint_callback.best_model_path}"
    )
type=str, default="experiment", choices=["experiment", "model"], help="Type of class which generated the checkpoint", ) add_argument("--save_dict_filename", type=str) # Loading of models and datasets args = args() syms = SymbolsTable(args.syms) device = torch.device("cuda:{}".format(args.gpu - 1) if args.gpu else "cpu") model = ModelLoader(args.train_path, filename=args.model_filename, device=device).load() if model is None: log.error("Could not find the model") exit(1) state = CheckpointLoader(device=device).load_by( os.path.join(args.train_path, args.checkpoint)) model.load_state_dict(state if args.source == "model" else Experiment.get_model_state_dict(state)) model = model.to(device) model.eval() dataset = TextImageFromTextTableDataset( args.txt_table, args.img_dirs, img_transform=ImageToTensor(),
def test_model_loader_choose_by_empty(tmpdir, input):
    """choose_by yields None when the pattern matches no checkpoint."""
    pattern = f"{tmpdir}/{input}"
    assert ModelLoader.choose_by(pattern) is None
def test_model_loader_get_model_state_dict(tmpdir, input, expected):
    """get_model_state_dict extracts the expected state dict from a saved checkpoint."""
    ckpt = Path(tmpdir) / "checkpoint.ckpt"
    torch.save(input, ckpt)
    assert ModelLoader(tmpdir).get_model_state_dict(ckpt) == expected
def test_model_loader_get_model_state_dict_raises(tmpdir, input, exception):
    """get_model_state_dict raises the expected exception on malformed checkpoints."""
    ckpt = Path(tmpdir) / "checkpoint.ckpt"
    torch.save(input, ckpt)
    with pytest.raises(exception):
        ModelLoader(tmpdir).get_model_state_dict(ckpt)
help='Resize images to this fixed height size') add_argument('syms', type=argparse.FileType('r'), help='Symbols table mapping from strings to integers') add_argument('img_dir', help='Directory containing word images') add_argument('tr_txt_table', type=argparse.FileType('r'), help='Character transcriptions of each training image') add_argument('va_txt_table', type=argparse.FileType('r'), help='Character transcriptions of each validation image') args = args() syms = SymbolsTable(args.syms) model = ModelLoader(args.train_path, gpu=args.gpu).load() if model is None: log.error('Could not find the model. Have you run ' '"pylaia-htr-create-model"?') exit(1) model = model.cuda(args.gpu - 1) if args.gpu else model.cpu() log.info('Model has {} parameters', sum(param.data.numel() for param in model.parameters())) trainer = TrainerLoader(args.train_path, gpu=args.gpu).load() if trainer is None: optimizer = SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum) parameters = { 'model': model,
def run(
    syms: str,
    img_list: str,
    img_dirs: Optional[List[str]] = None,
    common: CommonArgs = CommonArgs(),
    data: DataArgs = DataArgs(),
    decode: DecodeArgs = DecodeArgs(),
    trainer: TrainerArgs = TrainerArgs(),
):
    """Decode a list of images with a trained model.

    Args:
        syms: path to the symbols table mapping characters to integers.
        img_list: file listing the image ids to decode.
        img_dirs: directories in which the listed images live.
        common / data / decode / trainer: grouped CLI argument objects.
    """
    # resolve the checkpoint and load the model from it
    loader = ModelLoader(
        common.train_path, filename=common.model_filename, device="cpu"
    )
    checkpoint = loader.prepare_checkpoint(
        common.checkpoint,
        common.experiment_dirpath,
        common.monitor,
    )
    model = loader.load_by(checkpoint)
    assert (
        model is not None
    ), "Could not find the model. Have you run pylaia-htr-create-model?"

    # prepare the evaluator
    evaluator_module = EvaluatorModule(
        model,
        batch_input_fn=Compose([ItemFeeder("img"), ImageFeeder()]),
        batch_id_fn=ItemFeeder("id"),
    )

    # prepare the symbols
    syms = SymbolsTable(syms)

    # prepare the data
    data_module = DataModule(
        syms=syms,
        img_dirs=img_dirs,
        te_img_list=img_list,
        batch_size=data.batch_size,
        color_mode=data.color_mode,
        stage="test",
    )

    # choose the output callback: segmentation output or plain decoding
    if bool(decode.segmentation):
        output_callback = Segmentation(
            syms,
            segmentation=decode.segmentation,
            input_space=decode.input_space,
            separator=decode.separator,
            include_img_ids=decode.include_img_ids,
        )
    else:
        output_callback = Decode(
            syms=syms,
            use_symbols=decode.use_symbols,
            input_space=decode.input_space,
            output_space=decode.output_space,
            convert_spaces=decode.convert_spaces,
            join_string=decode.join_string,
            separator=decode.separator,
            include_img_ids=decode.include_img_ids,
        )

    # prepare the testing callbacks
    callbacks = [
        ProgressBar(refresh_rate=trainer.progress_bar_refresh_rate),
        output_callback,
    ]

    # prepare the trainer (TrainerArgs is expanded into pl.Trainer kwargs)
    trainer = pl.Trainer(
        default_root_dir=common.train_path,
        callbacks=callbacks,
        logger=False,
        **vars(trainer),
    )

    # decode!
    trainer.test(evaluator_module, datamodule=data_module, verbose=False)