Пример #1
0
def default_trainer_params(with_validation=False, preload=True):
    """Build small trainer parameters backed by the UW3 50-lines HDF5 file.

    Args:
        with_validation: When truthy, the same HDF5 file is additionally
            assigned as the validation data of the default pipeline.
            Otherwise the pipeline is replaced by a train-only one.
        preload: Forwarded as ``preload`` to every ``Hdf5`` params object
            created here.

    Returns:
        The trainer params obtained from
        ``CalamariTestScenario.default_trainer_params()`` after all
        adjustments below were applied.
    """
    params = CalamariTestScenario.default_trainer_params()
    h5_path = os.path.join(this_dir, "data", "uw3_50lines", "uw3-50lines.h5")
    train_data = Hdf5(files=[h5_path], preload=preload)

    if not with_validation:
        params.gen = CalamariTrainOnlyPipelineParams(train=train_data)
    else:
        # Training and validation read the very same file.
        params.gen.val = Hdf5(files=[h5_path], preload=preload)
        params.gen.train = train_data
        params.gen.__post_init__()

    # Batch size 1 and a single process for both pipelines.
    setup = params.gen.setup
    for pipeline_name in ("val", "train"):
        pipeline = getattr(setup, pipeline_name)
        pipeline.batch_size = 1
        pipeline.num_processes = 1

    params.epochs = 1
    params.samples_per_epoch = 2

    # Replace the pre-processing by a non-parallel pipeline that contains
    # only the sample preparation step.
    scenario = params.scenario
    scenario.data.pre_proc = SequentialProcessorPipelineParams(
        run_parallel=False,
        processors=[PrepareSampleProcessorParams()],
    )

    # Re-run the post-init hooks from the innermost object outwards.
    scenario.data.__post_init__()
    scenario.__post_init__()
    params.__post_init__()
    return params
Пример #2
0
def uw3_trainer_params(with_validation=False,
                       with_split=False,
                       preload=True,
                       debug=False):
    """Build trainer parameters for the UW3 50-lines PNG test data.

    Args:
        with_validation: When truthy (and ``with_split`` is falsy), the
            PNG files of the ``test`` folder are set as validation images
            on the default pipeline.
        with_split: When truthy, a split pipeline with a validation split
            ratio of 0.2 is used. Takes precedence over
            ``with_validation``.
        preload: Forwarded as ``preload`` to the training data params and,
            in the validation case, to the validation data params.
        debug: Written to both ``scenario.debug_graph_construction`` and
            ``force_eager`` of the trainer params.

    Returns:
        The trainer params after ``post_init`` was applied to them.
    """
    params = CalamariTestScenario.default_trainer_params()
    params.scenario.debug_graph_construction = debug
    params.force_eager = debug

    data_dir = os.path.join(this_dir, "data", "uw3_50lines")
    train_images = glob_all([os.path.join(data_dir, "train", "*.png")])
    train_data = FileDataParams(images=train_images, preload=preload)

    if with_split:
        params.gen = CalamariSplitTrainerPipelineParams(
            validation_split_ratio=0.2,
            train=train_data,
        )
    elif with_validation:
        val_images = glob_all([os.path.join(data_dir, "test", "*.png")])
        validation = params.gen.val
        validation.images = val_images
        validation.preload = preload
        params.gen.train = train_data
        params.gen.__post_init__()
    else:
        params.gen = CalamariTrainOnlyPipelineParams(train=train_data)

    # Batch size 1 and a single process for both pipelines.
    setup = params.gen.setup
    for pipeline_name in ("val", "train"):
        pipeline = getattr(setup, pipeline_name)
        pipeline.batch_size = 1
        pipeline.num_processes = 1

    post_init(params)
    return params
Пример #3
0
def default_trainer_params(
    *,
    with_validation=False,
    with_split=False,
    preload=True,
    img_suffix="nrm.png",
    channels=1,
):
    """Build small trainer parameters for the avicanon PageXML test data.

    Args:
        with_validation: When truthy (and ``with_split`` is falsy), page
            ``008`` is set as validation data on the default pipeline.
        with_split: When truthy, a split pipeline with a validation split
            ratio of 0.5 is used. Takes precedence over
            ``with_validation``.
        preload: Forwarded as ``preload`` to every ``PageXML`` params
            object created here.
        img_suffix: Text placed after ``"<page>."`` to form each image
            file name.
        channels: Written to ``scenario.data.input_channels``.

    Returns:
        The trainer params obtained from
        ``CalamariTestScenario.default_trainer_params()`` after all
        adjustments below were applied.
    """
    params = CalamariTestScenario.default_trainer_params()
    pagexml_dir = os.path.join(this_dir, "data", "avicanon_pagexml")

    train_images = [
        os.path.join(pagexml_dir, f"006.{img_suffix}"),
        os.path.join(pagexml_dir, f"007.{img_suffix}"),
    ]
    train_data = PageXML(images=train_images, preload=preload)

    if with_split:
        params.gen = CalamariSplitTrainerPipelineParams(
            validation_split_ratio=0.5,
            train=train_data,
        )
    elif with_validation:
        val_images = [os.path.join(pagexml_dir, f"008.{img_suffix}")]
        params.gen.val = PageXML(images=val_images, preload=preload)
        params.gen.train = train_data
        params.gen.__post_init__()
    else:
        params.gen = CalamariTrainOnlyPipelineParams(train=train_data)

    # Batch size 1 and a single process for both pipelines.
    setup = params.gen.setup
    for pipeline_name in ("val", "train"):
        pipeline = getattr(setup, pipeline_name)
        pipeline.batch_size = 1
        pipeline.num_processes = 1

    params.epochs = 1
    params.samples_per_epoch = 2

    scenario = params.scenario
    scenario.data.pre_proc.run_parallel = False
    scenario.data.input_channels = channels

    # Re-run the post-init hooks from the innermost object outwards.
    scenario.data.__post_init__()
    scenario.__post_init__()
    params.__post_init__()
    return params
Пример #4
0
def default_trainer_params(*,
                           with_validation=False,
                           with_split=False,
                           preload=True):
    """Build small trainer parameters for the Abbyy XML test data.

    Args:
        with_validation: When truthy (and ``with_split`` is falsy), the
            same ``*.jpg`` pattern used for training is also set as
            validation data on the default pipeline.
        with_split: When truthy, a split pipeline with a validation split
            ratio of 0.5 is used. Takes precedence over
            ``with_validation``.
        preload: Forwarded as ``preload`` to every ``Abbyy`` params object
            created here.

    Returns:
        The trainer params obtained from
        ``CalamariTestScenario.default_trainer_params()`` after all
        adjustments below were applied.
    """
    params = CalamariTestScenario.default_trainer_params()
    jpg_pattern = os.path.join(
        this_dir, "data", "hiltl_die_bank_des_verderbens_abbyyxml", "*.jpg"
    )
    train_data = Abbyy(images=[jpg_pattern], preload=preload)

    if with_split:
        params.gen = CalamariSplitTrainerPipelineParams(
            validation_split_ratio=0.5,
            train=train_data,
        )
    elif with_validation:
        # Validation uses the same image pattern as training.
        params.gen.val = Abbyy(images=[jpg_pattern], preload=preload)
        params.gen.train = train_data
        params.gen.__post_init__()
    else:
        params.gen = CalamariTrainOnlyPipelineParams(train=train_data)

    # Batch size 1 and a single process for both pipelines.
    setup = params.gen.setup
    for pipeline_name in ("val", "train"):
        pipeline = getattr(setup, pipeline_name)
        pipeline.batch_size = 1
        pipeline.num_processes = 1

    params.epochs = 1
    params.samples_per_epoch = 2

    scenario = params.scenario
    scenario.data.pre_proc.run_parallel = False

    # Re-run the post-init hooks from the innermost object outwards.
    scenario.data.__post_init__()
    scenario.__post_init__()
    params.__post_init__()
    return params
Пример #5
0
    def train_books(
        self,
        books,
        cachefile=None,
        name="model",
        skip_commented=True,
        validation_split_ratio=1,
        bidi="",
        n_augmentations=0,
        ema_decay=0.0,
        train_verbose=1,
        debug=False,
        epochs=100,
        whitelist="",
        keep_loaded_codec=False,
        preload=True,
        weights=None,
        ensemble=0,
    ):
        """Configure a Calamari trainer for the given books and run it.

        Args:
            books: A single book (``str``) or a collection of books; a
                string is wrapped into a one-element list.
            cachefile: Cache file passed to ``lids_from_books`` and
                ``Nsh5``; ``self.cachefile`` is used when ``None``.
            name: Assigned to the trainer params' ``output_dir``.
            skip_commented: Forwarded to ``lids_from_books``.
            validation_split_ratio: Values below 1 select a split pipeline
                with this ratio; anything else selects a train-only
                pipeline.
            bidi: Any truthy value sets the direction of all
                ``BidiTextProcessorParams`` post-processors to RTL.
            n_augmentations: Written to the augmentation processor params.
            ema_decay: Written to the trainer params' ``ema_decay``.
            train_verbose: Accepted but not read by this method.
            debug: Accepted but not read by this method.
            epochs: Written to the trainer params' ``epochs``.
            whitelist: Accepted but not read by this method.
            keep_loaded_codec: Written to ``codec.keep_loaded``.
            preload: Written to ``gen.train.preload``.
            weights: Written to ``warmstart.model``.
            ensemble: Written to ``scenario.model.ensemble``.

        Returns:
            Whatever the created trainer's ``train()`` call returns.
        """
        keras.backend.clear_session()

        book_list = [books] if isinstance(books, str) else books
        cache = self.cachefile if cachefile is None else cachefile

        params = CalamariScenario.default_trainer_params()
        line_ids = list(
            lids_from_books(
                book_list,
                cache,
                complete_only=True,
                skip_commented=skip_commented,
            )
        )
        train_data = Nsh5(cachefile=cache, lines=line_ids)

        # Work on copies of the default pre-processors so the defaults stay
        # untouched; only processors active in training mode are adjusted.
        data_params = params.scenario.data
        adjusted = []
        for template in data_params.pre_proc.processors:
            processor = deepcopy(template)
            adjusted.append(processor)
            if PipelineMode.TRAINING not in processor.modes:
                continue
            if isinstance(processor, FinalPreparationProcessorParams):
                processor.normalize = False
                processor.invert = False
                processor.transpose = True
            elif isinstance(processor, AugmentationProcessorParams):
                processor.n_augmentations = n_augmentations
            elif not isinstance(processor, PrepareSampleProcessorParams):
                # NOTE(review): the right-hand side is a tuple; this relies
                # on ``modes`` supporting ``-=`` with a tuple operand.
                # Confirm the type of ``modes``.
                processor.modes -= (PipelineMode.TRAINING, PipelineMode.EVALUATION)
        data_params.pre_proc.processors = adjusted

        # One index per entry reported for the "GPU" device type.
        params.device.gpus = [
            index for index, _device in enumerate(list_physical_devices("GPU"))
        ]

        if validation_split_ratio < 1:
            params.gen = CalamariSplitTrainerPipelineParams(
                validation_split_ratio=validation_split_ratio,
                train=train_data,
            )
        else:
            params.gen = CalamariTrainOnlyPipelineParams(train=train_data)

        if bidi:
            bidi_processors = data_params.post_proc.processors_of_type(
                BidiTextProcessorParams
            )
            for bidi_processor in bidi_processors:
                bidi_processor.bidi_direction = BidiDirection.RTL

        params.epochs = epochs
        params.codec.keep_loaded = keep_loaded_codec
        params.gen.train.preload = preload
        params.warmstart.model = weights
        params.scenario.model.ensemble = ensemble
        params.ema_decay = ema_decay

        # Re-run the post-init hooks from the innermost object outwards.
        params.scenario.data.__post_init__()
        params.scenario.__post_init__()
        params.__post_init__()

        # The output directory is assigned after the post-init hooks ran.
        params.output_dir = name
        return params.scenario.cls().create_trainer(params).train()