Example No. 1
    def test_data_generator(self):
        numbers = list(range(10))
        data_params = DataParams(
            pre_proc=ComposedProcessorPipelineParams(
                pipelines=[
                    SequentialProcessorPipelineParams(
                        max_tasks_per_process=10000,
                        run_parallel=True,
                        num_threads=1,
                        processors=[
                            RepeatSampleProcessorParams(f=100, add_per_step=7),
                        ],
                    )
                ]
            )
        )
        data = data_params.create()
        with data.create_pipeline(
            DataPipelineParams(mode=PipelineMode.TRAINING), SimpleDataGeneratorParams(numbers_to_generate=numbers)
        ).generate_input_samples(auto_repeat=False) as samples:
            # RepeatSampleProcessorParams(f=100) yields 10 * 100 samples in total;
            # zipping with range(10) consumes only the first 10 of them
            gen = zip(range(10), samples)
            out = [s.inputs for _, s in gen]

        with data.create_pipeline(
            DataPipelineParams(mode=PipelineMode.TRAINING), SimpleDataGeneratorParams(numbers_to_generate=numbers)
        ).generate_input_samples(auto_repeat=False) as samples:
            # a fresh pipeline over the same generator yields all samples
            out = [s.inputs for s in samples]
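
Inside the first block, the zip(range(10), samples) pattern simply stops after the first 10 generated samples. itertools.islice expresses the same cap more directly; a minimal sketch using the samples iterator from the test above, not part of the original code:

from itertools import islice

out = [s.inputs for s in islice(samples, 10)]  # take only the first 10 samples from the stream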
Example No. 2
    @classmethod
    def default_trainer_params(cls):
        p = super().default_trainer_params()
        p.gen.setup.train = DataPipelineParams(batch_size=1)
        p.gen.setup.val = DataPipelineParams(limit=1, batch_size=1)
        p.gen.__post_init__()
        p.gen.train_val.dataset = "fashion_mnist"
        p.skip_model_load_test = True
        p.random_seed = 1337
        p.force_eager = False
        p.epochs = 1
        p.samples_per_epoch = 1
        return p
Example No. 3
    @classmethod
    def default_trainer_params(cls):
        p = super().default_trainer_params()
        p.gen.setup.train = DataPipelineParams(limit=10, batch_size=1)
        p.gen.setup.val = DataPipelineParams(limit=10, batch_size=1)
        p.gen.__post_init__()
        p.gen.train_val.dataset = "fashion_mnist"
        p.gen.train_val.force_train = True  # always use the training data ...
        p.gen.train_val.shuffle = False  # ... and don't shuffle it
        p.skip_model_load_test = True
        p.random_seed = 1337
        p.force_eager = False
        p.epochs = 5
        p.samples_per_epoch = 10
        return p
Example No. 4
class TrainerPipelines:
    train: DataPipelineParams = field(
        default_factory=lambda: DataPipelineParams(mode=PipelineMode.TRAINING),
        metadata=pai_meta(fix_dc=True, mode="flat"),
    )
    val: DataPipelineParams = field(
        default_factory=lambda: DataPipelineParams(mode=PipelineMode.EVALUATION),
        metadata=pai_meta(fix_dc=True, mode="flat"),
    )

    def __post_init__(self):
        self.train.mode = PipelineMode.TRAINING
        self.val.mode = PipelineMode.EVALUATION
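
The __post_init__ hook is what keeps the two slots consistent: even if a caller passes params constructed with the wrong mode, the mode is forced back to the slot's intended value. A minimal self-contained sketch of the same pattern with plain dataclasses (stand-in classes for illustration, not the tfaip ones; TrainerPipelines itself is presumably decorated as a dataclass, which the snippet does not show):

from dataclasses import dataclass, field
from enum import Enum


class Mode(Enum):  # stand-in for PipelineMode
    TRAINING = "training"
    EVALUATION = "evaluation"


@dataclass
class PipelineParams:  # stand-in for DataPipelineParams
    mode: Mode = Mode.TRAINING
    batch_size: int = 16


@dataclass
class Pipelines:  # mirrors the TrainerPipelines pattern above
    train: PipelineParams = field(default_factory=lambda: PipelineParams(mode=Mode.TRAINING))
    val: PipelineParams = field(default_factory=lambda: PipelineParams(mode=Mode.EVALUATION))

    def __post_init__(self):
        # force the modes so caller-supplied params cannot end up in the wrong slot
        self.train.mode = Mode.TRAINING
        self.val.mode = Mode.EVALUATION


p = Pipelines(val=PipelineParams(mode=Mode.TRAINING, batch_size=1))
assert p.val.mode is Mode.EVALUATION  # corrected by __post_init__
assert p.val.batch_size == 1          # other fields are preserved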
Example No. 5
    def test_model_zoo(self):
        version = '1.0'
        url = f"https://github.com/Calamari-OCR/calamari_models/archive/{version}.tar.gz"
        with tempfile.TemporaryDirectory() as d:
            d = 'model_archive_permanent'  # for debugging: overrides the temporary directory so the download is kept
            os.makedirs(d, exist_ok=True)
            os.chdir(d)
            if not os.path.exists('calamari_models'):
                check_call([
                    'sh', '-c', ' '.join([
                        'wget', '-q', '-O', '-', url, '|', 'tar', 'xz', '&&',
                        'mv', f'calamari_models-{version}', 'calamari_models'
                    ])
                ])
            trainer_params = uw3_trainer_params(with_validation=True)
            args = PredictAndEvalArgs(
                checkpoint=glob(os.path.join('calamari_models', 'antiqua_modern', '*.ckpt.json')),
                predictor=PredictorParams(pipeline=DataPipelineParams(batch_size=5)),
                data=trainer_params.gen.val_gen(),
            )
            full_evaluation = predict_and_eval_main(args)
            self.assertLess(
                full_evaluation['voted']['eval']['avg_ler'], 0.001,
                "The character error rate on the test data must be below 0.1%")
Example No. 6
    def test_data_sequential_pipeline(self):
        numbers = list(range(10))
        target_numbers = []
        for n in numbers:
            target_numbers.append(n + 1 + 3)
            target_numbers.append(n + 1 + 3 + 7)
        target_numbers = [n * 3 + 1 for n in target_numbers]
        target_numbers = [n for n in target_numbers if n % 2 == 1]

        data_params = DataParams(
            pre_proc=SequentialProcessorPipelineParams(
                run_parallel=False,
                num_threads=3,
                processors=[
                    AddProcessorParams(v=1),
                    AddProcessorParams(v=3),
                    RepeatSampleProcessorParams(f=2, add_per_step=7),
                    MultiplyProcessorParams(f=3),
                    AddProcessorParams(v=1),
                    DropIfEvenProcessorParams(),
                ],
            )
        )
        data = data_params.create()
        with data.create_pipeline(
            DataPipelineParams(mode=PipelineMode.TRAINING), SimpleDataGeneratorParams(numbers_to_generate=numbers)
        ).generate_input_samples(auto_repeat=False) as samples:
            out = [s.inputs for s in samples]
            self.assertListEqual(target_numbers, out)
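
Working out target_numbers by hand is the fiddly part of this test. The following plain-Python sketch (no tfaip involved; the comments name the processor params used above) applies the same arithmetic per input and reproduces the list built in the test:

def simulate(n):
    n = n + 1                                    # AddProcessorParams(v=1)
    n = n + 3                                    # AddProcessorParams(v=3)
    repeated = [n, n + 7]                        # RepeatSampleProcessorParams(f=2, add_per_step=7)
    repeated = [x * 3 for x in repeated]         # MultiplyProcessorParams(f=3)
    repeated = [x + 1 for x in repeated]         # AddProcessorParams(v=1)
    return [x for x in repeated if x % 2 == 1]   # DropIfEvenProcessorParams()

assert [x for n in range(10) for x in simulate(n)] == target_numbers  # the list built in the test above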
Example No. 7
    def __init__(self, settings: AlgorithmPredictorSettings):
        super().__init__(settings)
        # ctc_decoder_params = deepcopy(settings.params.ctcDecoder.params)
        # lnp = LyricsNormalizationProcessor(LyricsNormalizationParams(LyricsNormalization.ONE_STRING))
        # if len(ctc_decoder_params.dictionary) > 0:
        #     ctc_decoder_params.dictionary[:] = [lnp.apply(word) for word in ctc_decoder_params.dictionary]
        # else:
        #     with open(os.path.join(BASE_DIR, 'internal_storage', 'resources', 'hyphen_dictionary.txt')) as f:
        #         # TODO: dataset params in settings, that we can create the correct normalization params
        #         ctc_decoder_params.dictionary[:] = [lnp.apply(line.split()[0]) for line in f.readlines()]

        # self.predictor = MultiPredictor(glob_all([s + '/text_best*.ckpt.json' for s in params.checkpoints]))
        voter_params = VoterParams()
        voter_params.type = VoterParams.type.ConfidenceVoterDefaultCTC
        self.predictor = MultiPredictor.from_paths(
            checkpoints=glob_all([settings.model.local_file('text.ckpt.json')]),
            voter_params=voter_params,
            predictor_params=PredictorParams(
                silent=True,
                progress_bar=True,
                pipeline=DataPipelineParams(batch_size=1, mode=PipelineMode("prediction")),
            ),
        )
        # self.height = self.predictor.predictors[0].network_params.features
        self.voter = voter_from_params(voter_params)
        self.dict_corrector = None

        if settings.params.useDictionaryCorrection:
            self.dict_corrector = DictionaryCorrector()
Example No. 8
    def test_predict_and_eval_uw3_with_voting(self):
        from calamari_ocr.test.test_train_file import uw3_trainer_params

        checkpoint = os.path.join(this_dir, "models", "best.ckpt")
        trainer_params = uw3_trainer_params(with_validation=True)
        args = PredictAndEvalArgs(
            checkpoint=[checkpoint, checkpoint, checkpoint],  # vote over three copies of the same checkpoint
            predictor=PredictorParams(pipeline=DataPipelineParams(batch_size=5)),
            data=trainer_params.gen.val_gen(),
        )
        main(args)
Example No. 9
    def test_predict_and_eval_hdf5(self):
        from calamari_ocr.test.test_train_hdf5 import default_trainer_params

        checkpoint = os.path.join(this_dir, "models", "best.ckpt")
        trainer_params = default_trainer_params(with_validation=True)
        args = PredictAndEvalArgs(
            checkpoint=[checkpoint],
            predictor=PredictorParams(pipeline=DataPipelineParams(num_processes=1)),
            data=trainer_params.gen.val_gen(),
        )
        main(args)
Example No. 10
    def test_standalone_pipeline(self):
        from tfaip.imports import DataBaseParams

        class TestDataParams(DataBaseParams):
            @staticmethod
            def cls():
                raise NotImplementedError

        data_params = TestDataParams()
        samples = [Sample()] * 100
        pipeline = data_params.pre_proc.create(
            DataPipelineParams(num_processes=8), data_params)
        for i, d in enumerate(pipeline.apply(samples)):
            print(i, d)
Example No. 11
    def test_data_composed_pipeline(self):
        numbers = list(range(10))
        target_numbers = []
        for n in numbers:
            # AddProcessorParams(v=1) and AddProcessorParams(v=3), then RepeatSampleProcessorParams(f=2, add_per_step=7)
            target_numbers.append(n + 1 + 3)
            target_numbers.append(n + 1 + 3 + 7)
        target_numbers = [n * 2 + 1 for n in target_numbers]  # MultiplyProcessorParams(f=2), then AddProcessorParams(v=1)
        target_numbers = [n for n in target_numbers if n % 2 == 1]  # DropIfEvenProcessorParams()

        data_params = DataParams(
            pre_proc=ComposedProcessorPipelineParams(
                pipelines=[
                    SequentialProcessorPipelineParams(
                        run_parallel=False,
                        num_threads=3,
                        processors=[
                            AddProcessorParams(v=1),
                        ],
                    ),
                    SequentialProcessorPipelineParams(
                        num_threads=1,  # Deterministic
                        processors=[
                            # Test that generator with pre- and post-processor works
                            AddProcessorParams(v=3),
                            RepeatSampleProcessorParams(f=2, add_per_step=7),
                            MultiplyProcessorParams(f=2),
                        ],
                    ),
                    SequentialProcessorPipelineParams(
                        run_parallel=True,
                        processors=[
                            AddProcessorParams(v=1),
                        ],
                    ),
                    SequentialProcessorPipelineParams(
                        num_threads=1,  # Deterministic
                        processors=[
                            DropIfEvenProcessorParams(),
                        ],
                    ),
                ]
            )
        )
        data = data_params.create()
        with data.create_pipeline(
            DataPipelineParams(mode=PipelineMode.TRAINING, use_shared_memory=True),
            SimpleDataGeneratorParams(numbers_to_generate=numbers),
        ).generate_input_samples(auto_repeat=False) as samples:
            out = [s.inputs for s in samples]
            self.assertListEqual(target_numbers, out)
Example No. 12
def run_test(test, parallel, n_numbers=100):
    numbers = list(range(n_numbers))
    target_numbers = []
    for n in numbers:
        target_numbers.append(n + 1 + 3)  # AddProcessorParams(v=1), AddProcessorParams(v=3)
    target_numbers = [n * 3 + 1 for n in target_numbers]  # MultiplyProcessorParams(f=3), AddProcessorParams(v=1)
    target_numbers = groups_into_samples(target_numbers)

    data_params = DataParams(
        pre_proc=SequentialProcessorPipelineParams(
            run_parallel=parallel,
            num_threads=3,
            processors=[
                AddProcessorParams(v=1),
                AddProcessorParams(v=3),
                MultiplyProcessorParams(f=3),
                AddProcessorParams(v=1),
                PrepareParams(),
            ],
        )
    )
    data = data_params.create()
    pipeline = data.create_pipeline(
        DataPipelineParams(mode=PipelineMode.TRAINING), BatchedDataGeneratorParams(numbers_to_generate=numbers)
    )

    with pipeline.generate_input_samples(auto_repeat=False) as batched_samples:
        # Test generate input samples
        batched_samples = list(batched_samples)

        out = [list(np.squeeze(x.inputs["n"], axis=-1)) for x in batched_samples]
        test.assertListEqual(target_numbers, out)

    with pipeline.generate_input_batches(auto_repeat=False) as batched_samples_ds:
        # Test dataset
        out = [list(np.squeeze(i["n"], axis=-1)) for i, t, m in batched_samples_ds]
        test.assertListEqual(target_numbers, out)

        for s in batched_samples_ds:
            # the tf.data pipeline yields the meta strings as bytes; convert them back to str for the comparison below
            s[2]["meta"] = np.array([[d[0].decode("utf-8")] for d in s[2]["meta"]])

        def check_equal(s1: dict, s2: dict):
            test.assertListEqual(list(s1.keys()), list(s2.keys()))
            for k in s1.keys():
                np.testing.assert_array_equal(s1[k], s2[k])

        for s1, s2 in zip(batched_samples, batched_samples_ds):
            check_equal(s1.inputs, s2[0])
            check_equal(s1.targets, s2[1])
            check_equal(s1.meta, s2[2])
Example No. 13
    def test_data_preload(self):
        numbers = list(range(10))
        data_params = DataParams(
            pre_proc=SequentialProcessorPipelineParams(
                max_tasks_per_process=10000,
                run_parallel=True,
                num_threads=1,  # deterministic
                processors=[
                    RepeatSampleProcessorParams(f=10, add_per_step=7),
                ],
            )
        )
        data = data_params.create()
        preloaded = data.create_pipeline(
            DataPipelineParams(mode=PipelineMode.TRAINING), SimpleDataGeneratorParams(numbers_to_generate=numbers)
        ).preload_input_samples()
        out_numbers = [int(s.inputs) for s in preloaded]
        self.assertListEqual(out_numbers, sum([[i + x * 7 for x in range(10)] for i in numbers], []))
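
Spelled out, the expected list in the final assertion expands each input i into the ten values i, i + 7, ..., i + 63 before the next input starts; this is just a restatement of the same expression, not extra test logic:

expected = sum([[i + x * 7 for x in range(10)] for i in range(10)], [])
# -> [0, 7, 14, ..., 63, 1, 8, 15, ..., 64, 2, 9, ...]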
Example No. 14
def run_pad_test(test, n_numbers=1000):
    numbers = list(range(n_numbers))
    data_params = DataPadParams()
    data = data_params.create()

    def to_tuple(s):
        return s.inputs, s.targets, s.meta

    pipeline = data.create_pipeline(
        DataPipelineParams(mode=PipelineMode.TRAINING), BatchedPadDataGeneratorParams(numbers_to_generate=numbers)
    )
    # Test generate input samples, and dataset
    with pipeline.generate_input_samples(auto_repeat=False) as batched_samples, pipeline.generate_input_batches(
        auto_repeat=False
    ) as batched_samples_with_ds:
        batched_samples_with_ds = list(batched_samples_with_ds)
        for (i1, t1, m1), (i2, t2, m2) in zip(map(to_tuple, batched_samples), batched_samples_with_ds):
            np.testing.assert_array_equal(i1["n"], i2["n"])
            np.testing.assert_array_equal(t1["n"], t2["n"])
Example No. 15
    def test_model_zoo(self):
        version = "1.0"
        url = f"https://github.com/Calamari-OCR/calamari_models/archive/{version}.tar.gz"
        with tempfile.TemporaryDirectory() as d:
            d = "model_archive_permanent"  # for debugging: overrides the temporary directory so the download is kept
            os.makedirs(d, exist_ok=True)
            os.chdir(d)
            if not os.path.exists("calamari_models"):
                check_call(
                    [
                        "sh",
                        "-c",
                        " ".join(
                            [
                                "wget", "-q", "-O", "-", url, "|", "tar", "xz", "&&",
                                "mv", f"calamari_models-{version}", "calamari_models",
                            ]
                        ),
                    ]
                )
            trainer_params = uw3_trainer_params(with_validation=True)
            args = PredictAndEvalArgs(
                checkpoint=glob(os.path.join("calamari_models", "antiqua_modern", "*.ckpt.json")),
                predictor=PredictorParams(pipeline=DataPipelineParams(batch_size=5)),
                data=trainer_params.gen.val_gen(),
            )
            full_evaluation = predict_and_eval_main(args)
            self.assertLess(
                full_evaluation["voted"]["eval"]["avg_ler"],
                0.001,
                "The character error rate on the test data must be below 0.1%",
            )
Example No. 16
    @classmethod
    def default_trainer_params(cls):
        p = super().default_trainer_params()
        p.gen.setup.train = DataPipelineParams(batch_size=1)
        p.gen.setup.val = DataPipelineParams(limit=5, batch_size=1)
        p.scenario.data.pre_proc.run_parallel = False
        return p