Пример #1
0
 def test_resolve_image_grayscale(self):
     """Resolve a grayscale PNG to a PIL image, with and without coordinates.

     Without coordinates the image is expected to be 1457x2083; with
     ``[[0, 0], [1, 1]]`` as second argument it is expected to be 1x1.
     """
     image_url = assets.url_of('kant_aufklaerung_1784-binarized/data/OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017.png')
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     ws = self.resolver.workspace_from_url(mets_url)

     whole_image = ws.resolve_image_as_pil(image_url)
     self.assertEqual(whole_image.size, (1457, 2083))

     tiny_image = ws._resolve_image_as_pil(image_url, [[0, 0], [1, 1]])
     self.assertEqual(tiny_image.size, (1, 1))
Пример #2
0
 def test_copies_ok(self):
     """Run ``DummyProcessor`` twice on a workspace copy and check the results.

     The first run reads ``OCR-D-IMG`` (expected: 3 files) and writes
     ``OUTPUT`` (expected afterwards: 6 files, 3 of them with the PAGE
     mimetype). The second run reads ``OUTPUT`` and is expected to add
     3 files to ``OUTPUT2``.
     """
     with copy_of_directory(assets.url_of('SBB0000F29300010000/data')) as wsdir:
         workspace = Workspace(Resolver(), wsdir)

         # Before processing: three inputs, no outputs.
         images = workspace.mets.find_files(fileGrp='OCR-D-IMG')
         self.assertEqual(len(images), 3)
         produced = workspace.mets.find_files(fileGrp='OUTPUT')
         self.assertEqual(len(produced), 0)

         # First pass: OCR-D-IMG -> OUTPUT
         run_processor(DummyProcessor,
                       input_file_grp='OCR-D-IMG',
                       output_file_grp='OUTPUT',
                       workspace=workspace)
         produced = sorted(workspace.mets.find_files(fileGrp='OUTPUT'),
                           key=lambda ocrd_file: ocrd_file.url)
         print([str(ocrd_file) for ocrd_file in produced])
         self.assertEqual(produced[0].url, 'OUTPUT/OUTPUT_0001.tif')
         self.assertEqual(produced[1].url, 'OUTPUT/OUTPUT_0001.xml')
         # The PAGE file must carry its own file ID and point at the image file.
         self.assertEqual(page_from_file(produced[1]).pcGtsId, produced[1].ID)
         self.assertEqual(page_from_file(produced[1]).get_Page().imageFilename, produced[0].url)
         self.assertEqual(len(produced), 6)
         self.assertEqual(len(workspace.mets.find_files(ID='//OUTPUT.*')), 6)
         self.assertEqual(len(workspace.mets.find_files(ID='//OUTPUT.*_PAGE')), 3)
         self.assertEqual(len(workspace.mets.find_files(fileGrp='OUTPUT', mimetype=MIMETYPE_PAGE)), 3)

         # Second pass: OUTPUT -> OUTPUT2
         run_processor(DummyProcessor,
                       input_file_grp='OUTPUT',
                       output_file_grp='OUTPUT2',
                       workspace=workspace)
         produced_again = sorted(workspace.mets.find_files(fileGrp='OUTPUT2'),
                                 key=lambda ocrd_file: ocrd_file.url)
         self.assertEqual(len(produced_again), 3)
 def test_validate_ocrd_file(self):
     """Validate ``FAULTY_GLYPHS_FILE`` and expect 17 ``ConsistencyError`` entries."""
     mets_url = assets.url_of('glyph-consistency/data/mets.xml')
     workspace = Resolver().workspace_from_url(mets_url)
     # Validation runs from within the workspace directory.
     with pushd_popd(workspace.directory):
         faulty_file = workspace.mets.find_all_files(ID="FAULTY_GLYPHS_FILE")[0]
         report = PageValidator.validate(ocrd_file=faulty_file)
         consistency_errors = [
             err for err in report.errors if isinstance(err, ConsistencyError)
         ]
         self.assertEqual(len(consistency_errors), 17, '17 textequiv consistency errors')
Пример #4
0
 def test_resolve_image_bitonal(self):
     """Resolve a bitonal PNG to a PIL image, with and without coordinates.

     Without coordinates the image is expected to be 1457x2083; with
     ``[[0, 0], [1, 1]]`` as second argument it is expected to be 1x1.
     """
     image_url = assets.url_of('kant_aufklaerung_1784-binarized/data/OCR-D-IMG-1BIT/OCR-D-IMG-1BIT_0017.png')
     ws = self.resolver.workspace_from_url(METS_HEROLD)

     whole_image = ws._resolve_image_as_pil(image_url)
     self.assertEqual(whole_image.size, (1457, 2083))

     tiny_image = ws._resolve_image_as_pil(image_url, [[0, 0], [1, 1]])
     self.assertEqual(tiny_image.size, (1, 1))
 def test_check_file_grp_basic(self):
     """Check the error reported by ``check_file_grp`` for bad input/output groups.

     Each failing combination must produce exactly one error with the
     expected message; ``None`` input with an empty output must be valid.
     """
     workspace = self.resolver.workspace_from_url(
         assets.url_of('SBB0000F29300010000/data/mets.xml'))

     # (input fileGrp, output fileGrp, the single expected error message)
     failing_cases = [
         ('foo', 'bar',
          "Input fileGrp[@USE='foo'] not in METS!"),
         ('OCR-D-IMG', 'OCR-D-IMG-BIN',
          "Output fileGrp[@USE='OCR-D-IMG-BIN'] already in METS!"),
         ('OCR-D-IMG,FOO', 'FOO',
          "Input fileGrp[@USE='FOO'] not in METS!"),
         ('OCR-D-IMG,FOO', None,
          "Input fileGrp[@USE='FOO'] not in METS!"),
     ]
     for input_grp, output_grp, expected_error in failing_cases:
         report = WorkspaceValidator.check_file_grp(workspace, input_grp, output_grp)
         self.assertFalse(report.is_valid)
         self.assertEqual(len(report.errors), 1)
         self.assertEqual(report.errors[0], expected_error)

     report = WorkspaceValidator.check_file_grp(workspace, None, '')
     self.assertTrue(report.is_valid)
 def test_check_file_grp_page_id_valid(self):
     """``check_file_grp`` restricted to page ``PHYS_0004`` must report a valid result."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     ws = self.resolver.workspace_from_url(mets_url)
     result = WorkspaceValidator.check_file_grp(
         ws, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0004')
     self.assertTrue(result.is_valid)
Пример #7
0
 def test_no_input_file_grp(self):
     """Accessing ``input_files`` without an input fileGrp must raise."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     proc = run_processor(DummyProcessor, resolver=self.resolver, mets_url=mets_url)
     expected_message = 'Processor is missing input fileGrp'
     with self.assertRaisesRegex(Exception, expected_message):
         # The attribute access itself is what is expected to raise.
         _ = proc.input_files
Пример #8
0
 def test_validate_ocrd_file(self):
     """Validate ``FAULTY_GLYPHS_FILE`` and expect 17 reported errors."""
     mets_url = assets.url_of('glyph-consistency/data/mets.xml')
     workspace = Resolver().workspace_from_url(mets_url)
     faulty_file = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")[0]
     # Fetch the file first when it has no local filename yet.
     has_local_copy = bool(faulty_file.local_filename)
     if not has_local_copy:
         workspace.download_file(faulty_file)
     result = PageValidator.validate(ocrd_file=faulty_file)
     self.assertEqual(len(result.errors), 17, 'errors')
Пример #9
0
 def test_validate_twice(self):
     """Running ``_validate`` twice on one validator must still give a valid report."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
     validator = WorkspaceValidator(self.resolver, mets_url, download=True)
     # The first result is discarded; only the second run's report is checked.
     validator._validate()  # pylint: disable=protected-access
     second_report = validator._validate()  # pylint: disable=protected-access
     self.assertTrue(second_report.is_valid)
Пример #10
0
 def test_with_mets_url_input_files(self):
     """A processor built from a METS URL must expose 20 input files, all PAGE."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     proc = run_processor(DummyProcessor, resolver=self.resolver, mets_url=mets_url)
     self.assertEqual(len(proc.input_files), 20)
     is_page = [
         input_file.mimetype == MIMETYPE_PAGE for input_file in proc.input_files
     ]
     self.assertTrue(all(is_page))
Пример #11
0
 def test_validate_ocrd_file(self):
     """Validate ``FAULTY_GLYPHS_FILE`` from the workspace directory; expect 17 errors."""
     mets_url = assets.url_of('glyph-consistency/data/mets.xml')
     ws = Resolver().workspace_from_url(mets_url)
     with pushd_popd(ws.directory):
         faulty_file = ws.mets.find_files(ID="FAULTY_GLYPHS_FILE")[0]
         result = PageValidator.validate(ocrd_file=faulty_file)
         error_count = len(result.errors)
         self.assertEqual(error_count, 17, 'errors')
Пример #12
0
 def test_resolve_image_bitonal(self):
     """Resolve a workspace-relative bitonal PNG, with and without coordinates.

     Without coordinates the image is expected to be 1457x2083; with
     ``[[0, 0], [1, 1]]`` as second argument it is expected to be 1x1.
     """
     asset_root = assets.url_of('kant_aufklaerung_1784-binarized')
     ws = self.resolver.workspace_from_url(pjoin(asset_root, 'data/mets.xml'))
     relative_image = 'OCR-D-IMG-1BIT/OCR-D-IMG-1BIT_0017.png'

     whole_image = ws._resolve_image_as_pil(relative_image)
     self.assertEqual(whole_image.size, (1457, 2083))

     tiny_image = ws._resolve_image_as_pil(relative_image, [[0, 0], [1, 1]])
     self.assertEqual(tiny_image.size, (1, 1))
Пример #13
0
 def test_run_cli(self):
     """Call ``run_cli`` on ``echo`` with a full option set, then with the minimum."""
     with TemporaryDirectory() as tempdir:
         # First call: every option spelled out.
         full_options = {
             'mets_url': assets.url_of('SBB0000F29300010000/data/mets.xml'),
             'resolver': Resolver(),
             'workspace': None,
             'page_id': 'page1',
             'log_level': 'DEBUG',
             'input_file_grp': 'INPUT',
             'output_file_grp': 'OUTPUT',
             'parameter': '/path/to/param.json',
             'working_dir': tempdir,
         }
         run_cli('echo', **full_options)

         # Second call: only the METS URL and a resolver.
         minimal_options = {
             'mets_url': assets.url_of('SBB0000F29300010000/data/mets.xml'),
             'resolver': Resolver(),
         }
         run_cli('echo', **minimal_options)
Пример #14
0
def test_resolve_image_as_pil_deprecated():
    """Calling ``resolve_image_as_pil`` must emit exactly one DeprecationWarning."""
    # arrange
    asset_root = assets.url_of('kant_aufklaerung_1784-binarized')
    mets_path = os.path.join(asset_root, 'data/mets.xml')
    ws = Resolver().workspace_from_url(mets_path)

    # act
    with pytest.warns(DeprecationWarning) as caught:
        ws.resolve_image_as_pil('OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017.png')

    # assert
    assert len(caught) == 1
    warning_text = str(caught[0].message)
    assert 'Call to deprecated method resolve_image_as_pil.' in warning_text
 def test_check_file_grp_page_id_list(self):
     """``check_file_grp`` with a list of page IDs must report exactly one error."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     ws = self.resolver.workspace_from_url(mets_url)
     page_ids = ['PHYS_0003', 'PHYS_0001']
     result = WorkspaceValidator.check_file_grp(
         ws, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id=page_ids)
     self.assertFalse(result.is_valid)
     self.assertEqual(len(result.errors), 1)
Пример #16
0
 def test_run1(self):
     """Run ``KrakenSegment`` at line level on the binarized Kant workspace and save the METS."""
     mets_url = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
     ws = Resolver().workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
     segmenter = KrakenSegment(
         ws,
         input_file_grp="OCR-D-IMG-BIN",
         output_file_grp="OCR-D-SEG-LINE-KRAKEN",
         parameter={'level-of-operation': 'line'},
     )
     segmenter.process()
     ws.save_mets()
Пример #17
0
 def test_param_json(self):
     """Run ``KrakenOcr`` on the one-file workspace and save the resulting METS."""
     resolver = Resolver()
     mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
     ws = resolver.workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
     run_processor(
         KrakenOcr,
         resolver=resolver,
         workspace=ws,
         input_file_grp="INPUT",
         output_file_grp="OCR-D-OCR-KRAKEN",
     )
     ws.save_mets()
Пример #18
0
 def test_parameter_url(self):
     """Pass the parameters as a ``file://`` URL pointing at an empty JSON object."""
     with TemporaryDirectory() as tempdir:
         jsonpath = join(tempdir, 'params.json')
         with open(jsonpath, 'w') as param_file:
             param_file.write('{}')
         parameter_url = 'file://%s' % jsonpath
         mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
         proc = run_processor(
             DummyProcessor,
             parameter=parameter_url,
             resolver=self.resolver,
             mets_url=mets_url,
         )
         self.assertEqual(len(proc.input_files), 35)
Пример #19
0
 def test_parameter(self):
     """Pass the parameters as a dict loaded from a temporary JSON file."""
     with TemporaryDirectory() as tempdir:
         jsonpath = join(tempdir, 'params.json')
         with open(jsonpath, 'w') as param_file:
             param_file.write('{"baz": "quux"}')
         # Read the file back so the processor receives an already parsed dict.
         with open(jsonpath, 'r') as param_file:
             params = json.load(param_file)
             proc = run_processor(
                 DummyProcessor,
                 parameter=params,
                 resolver=self.resolver,
                 mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
             )
         self.assertEqual(len(proc.input_files), 20)
 def test_check_file_grp_page_id_str(self):
     """``check_file_grp`` with comma-separated page IDs must report the page that already has output."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     ws = self.resolver.workspace_from_url(mets_url)
     result = WorkspaceValidator.check_file_grp(
         ws, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0003,PHYS_0001')
     self.assertFalse(result.is_valid)
     self.assertEqual(len(result.errors), 1)
     expected_error = "Output fileGrp[@USE='OCR-D-IMG-BIN'] already contains output for page PHYS_0001"
     self.assertEqual(result.errors[0], expected_error)
Пример #21
0
    def testProcessorProfiling(self):
        """Check that running a processor logs how long it took.

        A stream handler writing into an in-memory buffer is attached to the
        ``ocrd.process.profile`` logger, ``DummyProcessor`` is run, and the
        captured text is matched against the expected profiling message.
        The handler is detached again afterwards so that it does not stay on
        the logger with a closed stream.
        """
        initLogging()
        log_capture_string = FIFOIO(256)
        ch = logging.StreamHandler(log_capture_string)
        ch.setFormatter(logging.Formatter(LOG_FORMAT))
        profile_logger = getLogger('ocrd.process.profile')
        profile_logger.setLevel('DEBUG')
        profile_logger.addHandler(ch)

        try:
            run_processor(DummyProcessor, resolver=Resolver(), mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
            log_contents = log_capture_string.getvalue()
        finally:
            # Detach before closing: a handler left on the logger would try to
            # write to the closed buffer whenever a later test logs to it.
            profile_logger.removeHandler(ch)
            log_capture_string.close()

        # Check whether profile information has been logged. The pattern
        # requires a duration of the form "0.<digits>s", i.e. under one second.
        # NOTE(review): `match` anchors at the start and `.` does not cross
        # newlines, so this relies on the message being on the first captured
        # line - confirm against FIFOIO's behavior.
        self.assertTrue(match(r'.*Executing processor \'ocrd-test\' took 0\.\d+s.*', log_contents))
Пример #22
0
 def setUp(self):
     """Run the parent set-up, then load the METS of the SBB0000F29300010000 asset."""
     super().setUp()
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     self.mets = OcrdMets(filename=mets_url)
Пример #23
0
import os
from os.path import join, exists
from shutil import copytree, rmtree
from re import sub
from tempfile import TemporaryDirectory

from tests.base import TestCase, assets, main

from ocrd.resolver import Resolver
from ocrd.workspace import Workspace
#  from ocrd_utils.logging import setOverrideLogLevel
#  setOverrideLogLevel('DEBUG')

# Scratch directory; TestResolver.setUp wipes it and copies the Kant asset into it.
TMP_FOLDER = '/tmp/test-pyocrd-resolver'
# URL of the METS file of the SBB0000F29300010000 asset.
METS_HEROLD = assets.url_of('SBB0000F29300010000/data/mets.xml')
# Path of the kant_aufklaerung_1784 asset folder (source of the copy made in setUp).
FOLDER_KANT = assets.path_to('kant_aufklaerung_1784')
# Path of the test.ocrd.zip asset.
TEST_ZIP = assets.path_to('test.ocrd.zip')
# Working directory at import time.
oldpwd = os.getcwd()

# pylint: disable=redundant-unittest-assert, broad-except, deprecated-method, too-many-public-methods


class TestResolver(TestCase):
    """Resolver tests that work on a scratch copy of the ``kant_aufklaerung_1784`` asset."""

    def setUp(self):
        """Create a resolver and copy the Kant asset folder below ``TMP_FOLDER``."""
        self.resolver = Resolver()
        self.folder = join(TMP_FOLDER, 'kant_aufklaerung_1784')

        # Replace a scratch folder left behind by an earlier run with an empty one.
        scratch_exists = exists(TMP_FOLDER)
        if scratch_exists:
            rmtree(TMP_FOLDER)
            os.makedirs(TMP_FOLDER)

        copytree(src=FOLDER_KANT, dst=self.folder)
Пример #24
0
 def setUp(self):
     """Create a resolver and open the SBB0000F29300010000 workspace with it."""
     self.resolver = Resolver()
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     self.workspace = self.resolver.workspace_from_url(mets_url)
Пример #25
0
# pylint: disable=import-error

import os
import shutil

from tests.base import TestCase, assets, main

from ocrd import Resolver
from ocrd_kraken.binarize import KrakenBinarize
from ocrd_utils.logging import setOverrideLogLevel

# Applied at import time, i.e. as soon as this test module is loaded.
setOverrideLogLevel('DEBUG')

# URL of the param-binarize.json asset.
PARAM_JSON = assets.url_of('param-binarize.json')

# Scratch directory that TestKrakenBinarize.setUp recreates empty before each test.
WORKSPACE_DIR = '/tmp/ocrd-kraken-binarize-test'


class TestKrakenBinarize(TestCase):
    """Tests for ``KrakenBinarize`` running in a scratch workspace directory."""

    def setUp(self):
        """Recreate ``WORKSPACE_DIR`` as an empty directory."""
        scratch_dir = WORKSPACE_DIR
        # Drop whatever an earlier run left behind before creating the directory.
        if os.path.exists(scratch_dir):
            shutil.rmtree(scratch_dir)
        os.makedirs(scratch_dir)

    #  def test_param_json(self):
    #      resolver = Resolver()
    #      workspace =  resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'), dst_dir=WORKSPACE_DIR)
    #      run_processor(
    #          KrakenBinarize,
    #          resolver=resolver,
    #          workspace=workspace,
Пример #26
0
 def setUp(self):
     """Load the METS of the SBB0000F29300010000 asset, then initialize logging."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     self.mets = OcrdMets(filename=mets_url)
     initLogging()
Пример #27
0
# pylint: disable=import-error

import os
import shutil

from tests.base import TestCase, assets, main

from ocrd.resolver import Resolver
from ocrd_kraken.segment import KrakenSegment
# URL of the param-segment.json asset.
PARAM_JSON = assets.url_of('param-segment.json')

# Scratch directory that TestKrakenSegment.setUp recreates empty before each test.
WORKSPACE_DIR = '/tmp/ocrd-ocropy-segment-test'

class TestKrakenSegment(TestCase):
    """Tests for ``KrakenSegment`` running in a scratch workspace directory."""

    def setUp(self):
        """Recreate ``WORKSPACE_DIR`` as an empty directory."""
        scratch_dir = WORKSPACE_DIR
        # Drop whatever an earlier run left behind before creating the directory.
        if os.path.exists(scratch_dir):
            shutil.rmtree(scratch_dir)
        os.makedirs(scratch_dir)

    def test_run1(self):
        """Run line-level segmentation on the binarized Kant workspace and save the METS."""
        mets_url = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
        ws = Resolver().workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
        segmenter = KrakenSegment(
            ws,
            input_file_grp="OCR-D-IMG-BIN",
            output_file_grp="OCR-D-SEG-LINE-KRAKEN",
            parameter={'level-of-operation': 'line'},
        )
        segmenter.process()
        ws.save_mets()
Пример #28
0
def test_resolve_image_as_pil(image_url, size_pil):
    """Resolve ``image_url`` with coordinates ``[[0, 0], [1, 1]]`` and compare its size to ``size_pil``."""
    mets_url = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
    ws = Resolver().workspace_from_url(mets_url)
    resolved = ws._resolve_image_as_pil(image_url, [[0, 0], [1, 1]])
    assert resolved.size == size_pil
 def test_simple(self):
     """Validating the one-file workspace with downloads enabled must succeed."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
     result = WorkspaceValidator.validate(self.resolver, mets_url, download=True)
     self.assertTrue(result.is_valid)
Пример #30
0
 def test_with_mets_url_input_files(self):
     """A processor built from a METS URL must expose 35 input files."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     proc = run_processor(DummyProcessor, resolver=self.resolver, mets_url=mets_url)
     input_count = len(proc.input_files)
     self.assertEqual(input_count, 35)