def uw3_trainer_params(with_validation=False, with_split=False, preload=True, debug=False):
    """Create trainer params for the ``uw3_50lines`` test data.

    Args:
        with_validation: Use the images in the ``test`` folder as validation
            data. Ignored when ``with_split`` is set.
        with_split: Use a split trainer pipeline with
            ``validation_split_ratio=0.2`` on the training images. Takes
            precedence over ``with_validation``.
        preload: Forwarded as ``preload`` to the data params.
        debug: Written to both ``scenario.debug_graph_construction`` and
            ``force_eager``.

    Returns:
        The configured trainer params after ``post_init`` was applied.
    """
    params = CalamariTestScenario.default_trainer_params()
    params.scenario.debug_graph_construction = debug
    params.force_eager = debug

    data_root = os.path.join(this_dir, "data", "uw3_50lines")
    train_data = FileDataParams(
        images=glob_all([os.path.join(data_root, "train", "*.png")]),
        preload=preload,
    )

    if with_split:
        params.gen = CalamariSplitTrainerPipelineParams(
            validation_split_ratio=0.2,
            train=train_data,
        )
    elif not with_validation:
        params.gen = CalamariTrainOnlyPipelineParams(train=train_data)
    else:
        # Keep the default generator and only point it at the test images.
        params.gen.val.images = glob_all([os.path.join(data_root, "test", "*.png")])
        params.gen.val.preload = preload
        params.gen.train = train_data
        params.gen.__post_init__()

    # Single-sample batches and a single worker for both pipelines.
    for pipeline_setup in (params.gen.setup.val, params.gen.setup.train):
        pipeline_setup.batch_size = 1
        pipeline_setup.num_processes = 1

    post_init(params)
    return params
def default_trainer_params(
    *,
    with_validation=False,
    with_split=False,
    preload=True,
    img_suffix="nrm.png",
    channels=1,
):
    """Create trainer params for the ``avicanon_pagexml`` test data.

    Pages ``006`` and ``007`` are used for training.

    Args:
        with_validation: Use page ``008`` as validation data. Ignored when
            ``with_split`` is set.
        with_split: Use a split trainer pipeline with
            ``validation_split_ratio=0.5`` on the training pages. Takes
            precedence over ``with_validation``.
        preload: Forwarded as ``preload`` to the ``PageXML`` data params.
        img_suffix: File name suffix appended after ``<page>.``.
        channels: Written to ``scenario.data.input_channels``.

    Returns:
        The configured trainer params (1 epoch, 2 samples per epoch).
    """

    def page_image(page):
        # Path of one page image inside the PageXML test data folder.
        return os.path.join(this_dir, "data", "avicanon_pagexml", f"{page}.{img_suffix}")

    params = CalamariTestScenario.default_trainer_params()
    train_data = PageXML(
        images=[page_image("006"), page_image("007")],
        preload=preload,
    )

    if with_split:
        params.gen = CalamariSplitTrainerPipelineParams(
            validation_split_ratio=0.5,
            train=train_data,
        )
    elif not with_validation:
        params.gen = CalamariTrainOnlyPipelineParams(train=train_data)
    else:
        # Keep the default generator, replacing its train and val data.
        params.gen.val = PageXML(images=[page_image("008")], preload=preload)
        params.gen.train = train_data
        params.gen.__post_init__()

    # Single-sample batches and a single worker for both pipelines.
    for pipeline_setup in (params.gen.setup.val, params.gen.setup.train):
        pipeline_setup.batch_size = 1
        pipeline_setup.num_processes = 1

    params.epochs = 1
    params.samples_per_epoch = 2

    data_params = params.scenario.data
    data_params.pre_proc.run_parallel = False
    data_params.input_channels = channels

    # Re-run the post-init hooks from the innermost params outwards.
    params.scenario.data.__post_init__()
    params.scenario.__post_init__()
    params.__post_init__()
    return params
def default_trainer_params(*, with_validation=False, with_split=False, preload=True):
    """Create trainer params for the Abbyy XML test data.

    The ``*.jpg`` pattern is handed to ``Abbyy`` as is; it is not expanded
    in this function.

    Args:
        with_validation: Use an ``Abbyy`` validation set that points at the
            same ``*.jpg`` pattern as the training set. Ignored when
            ``with_split`` is set.
        with_split: Use a split trainer pipeline with
            ``validation_split_ratio=0.5`` on the training data. Takes
            precedence over ``with_validation``.
        preload: Forwarded as ``preload`` to the ``Abbyy`` data params.

    Returns:
        The configured trainer params (1 epoch, 2 samples per epoch).
    """
    params = CalamariTestScenario.default_trainer_params()

    scan_pattern = os.path.join(
        this_dir, "data", "hiltl_die_bank_des_verderbens_abbyyxml", "*.jpg"
    )
    train_data = Abbyy(images=[scan_pattern], preload=preload)

    if with_split:
        params.gen = CalamariSplitTrainerPipelineParams(
            validation_split_ratio=0.5,
            train=train_data,
        )
    elif not with_validation:
        params.gen = CalamariTrainOnlyPipelineParams(train=train_data)
    else:
        # Validation intentionally reuses the training pattern (separate object).
        params.gen.val = Abbyy(images=[scan_pattern], preload=preload)
        params.gen.train = train_data
        params.gen.__post_init__()

    # Single-sample batches and a single worker for both pipelines.
    for pipeline_setup in (params.gen.setup.val, params.gen.setup.train):
        pipeline_setup.batch_size = 1
        pipeline_setup.num_processes = 1

    params.epochs = 1
    params.samples_per_epoch = 2
    params.scenario.data.pre_proc.run_parallel = False

    # Re-run the post-init hooks from the innermost params outwards.
    params.scenario.data.__post_init__()
    params.scenario.__post_init__()
    params.__post_init__()
    return params
def train_books(
    self,
    books,
    cachefile=None,
    name="model",
    skip_commented=True,
    validation_split_ratio=1,
    bidi="",
    n_augmentations=0,
    ema_decay=0.0,
    train_verbose=1,
    debug=False,
    epochs=100,
    whitelist="",
    keep_loaded_codec=False,
    preload=True,
    weights=None,
    ensemble=0,
):
    """Train a Calamari model on the lines of one or more books.

    Args:
        books: A single book name or a list of book names.
        cachefile: Cache file to read the lines from; defaults to
            ``self.cachefile``.
        name: Written to the trainer params' ``output_dir``.
        skip_commented: Forwarded to ``lids_from_books``.
        validation_split_ratio: When smaller than 1, a split trainer pipeline
            with this ratio is used; otherwise a train-only pipeline.
        bidi: Any truthy value sets the direction of every
            ``BidiTextProcessorParams`` post-processor to RTL.
        n_augmentations: Written to the augmentation pre-processor.
        ema_decay: Written to the trainer params' ``ema_decay``.
        train_verbose: Currently unused by this method.
        debug: Currently unused by this method.
        epochs: Number of training epochs.
        whitelist: Currently unused by this method.
        keep_loaded_codec: Written to ``codec.keep_loaded``.
        preload: Written to the training data's ``preload``.
        weights: Written to ``warmstart.model``.
        ensemble: Written to ``scenario.model.ensemble``.

    Returns:
        Whatever the created trainer's ``train()`` returns.
    """
    keras.backend.clear_session()
    if isinstance(books, str):
        books = [books]
    if cachefile is None:
        cachefile = self.cachefile

    p = CalamariScenario.default_trainer_params()
    lids = list(
        lids_from_books(
            books, cachefile, complete_only=True, skip_commented=skip_commented
        )
    )
    train = Nsh5(cachefile=cachefile, lines=lids)

    # Work on copies so the default processor params are never mutated.
    disabled_modes = {PipelineMode.TRAINING, PipelineMode.EVALUATION}
    newprcs = []
    for prc in p.scenario.data.pre_proc.processors:
        prc = deepcopy(prc)
        if PipelineMode.TRAINING in prc.modes:
            if isinstance(prc, FinalPreparationProcessorParams):
                prc.normalize, prc.invert, prc.transpose = False, False, True
            elif isinstance(prc, AugmentationProcessorParams):
                prc.n_augmentations = n_augmentations
            elif not isinstance(prc, PrepareSampleProcessorParams):
                # Switch the processor off for training and evaluation.
                # ``set -= tuple`` raises TypeError for built-in sets, so the
                # reduced set is built explicitly from a set operand.
                prc.modes = set(prc.modes) - disabled_modes
        newprcs.append(prc)
    p.scenario.data.pre_proc.processors = newprcs

    # Use the indices of all visible GPUs.
    p.device.gpus = [n for n, _ in enumerate(list_physical_devices("GPU"))]

    if validation_split_ratio < 1:
        p.gen = CalamariSplitTrainerPipelineParams(
            validation_split_ratio=validation_split_ratio, train=train
        )
    else:
        p.gen = CalamariTrainOnlyPipelineParams(train=train)

    if bidi:
        for prc in p.scenario.data.post_proc.processors_of_type(
            BidiTextProcessorParams
        ):
            prc.bidi_direction = BidiDirection.RTL

    p.epochs = epochs
    p.codec.keep_loaded = keep_loaded_codec
    p.gen.train.preload = preload
    p.warmstart.model = weights
    p.scenario.model.ensemble = ensemble
    p.ema_decay = ema_decay

    # Re-run the post-init hooks from the innermost params outwards.
    p.scenario.data.__post_init__()
    p.scenario.__post_init__()
    p.__post_init__()

    p.output_dir = name
    trainer = p.scenario.cls().create_trainer(p)
    return trainer.train()