Exemplo n.º 1
0
 def test_resolve_image_grayscale(self):
     """Resolve a grayscale image in full and as a 1x1 crop and check both sizes."""
     image_url = assets.url_of('kant_aufklaerung_1784-binarized/data/OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017.png')
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     ws = self.resolver.workspace_from_url(mets_url)

     # Whole image, resolved through the public method.
     full_image = ws.resolve_image_as_pil(image_url)
     self.assertEqual(full_image.size, (1457, 2083))

     # Same image restricted to the coordinates [[0, 0], [1, 1]].
     cropped_image = ws._resolve_image_as_pil(image_url, [[0, 0], [1, 1]])
     self.assertEqual(cropped_image.size, (1, 1))
Exemplo n.º 2
0
 def test_copies_ok(self):
     """Run DummyProcessor twice on a workspace copy and check the files it registers.

     The first run reads fileGrp ``OCR-D-IMG`` and writes ``OUTPUT``; the
     second run reads ``OUTPUT`` and writes ``OUTPUT2``.
     """
     with copy_of_directory(assets.url_of('SBB0000F29300010000/data')) as wsdir:
         workspace = Workspace(Resolver(), wsdir)
         input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
         self.assertEqual(len(input_files), 3)
         output_files = workspace.mets.find_files(fileGrp='OUTPUT')
         self.assertEqual(len(output_files), 0)
         run_processor(
             DummyProcessor,
             input_file_grp='OCR-D-IMG',
             output_file_grp='OUTPUT',
             workspace=workspace
         )
         output_files = sorted(workspace.mets.find_files(fileGrp='OUTPUT'),
                               key=lambda f: f.url)
         # Check the count before indexing, so that a short result shows up
         # as a readable assertion failure instead of an IndexError.
         self.assertEqual(len(output_files), 6)
         self.assertEqual(output_files[0].url, 'OUTPUT/OUTPUT_0001.tif')
         self.assertEqual(output_files[1].url, 'OUTPUT/OUTPUT_0001.xml')
         # Parse the PAGE file once and reuse it for both checks.
         first_page = page_from_file(output_files[1])
         self.assertEqual(first_page.pcGtsId, output_files[1].ID)
         self.assertEqual(first_page.get_Page().imageFilename, output_files[0].url)
         self.assertEqual(len(workspace.mets.find_files(ID='//OUTPUT.*')), 6)
         self.assertEqual(len(workspace.mets.find_files(ID='//OUTPUT.*_PAGE')), 3)
         self.assertEqual(len(workspace.mets.find_files(fileGrp='OUTPUT', mimetype=MIMETYPE_PAGE)), 3)
         run_processor(
             DummyProcessor,
             input_file_grp='OUTPUT',
             output_file_grp='OUTPUT2',
             workspace=workspace
         )
         # Only the number of files matters here, so no sorting is needed.
         output2_files = workspace.mets.find_files(fileGrp='OUTPUT2')
         self.assertEqual(len(output2_files), 3)
 def test_validate_ocrd_file(self):
     """Validate the FAULTY_GLYPHS_FILE entry and expect 17 ConsistencyError entries."""
     workspace = Resolver().workspace_from_url(
         assets.url_of('glyph-consistency/data/mets.xml'))
     with pushd_popd(workspace.directory):
         faulty_file = workspace.mets.find_all_files(ID="FAULTY_GLYPHS_FILE")[0]
         report = PageValidator.validate(ocrd_file=faulty_file)
         # Count only the errors that are ConsistencyError instances.
         consistency_errors = [
             err for err in report.errors if isinstance(err, ConsistencyError)
         ]
         self.assertEqual(len(consistency_errors), 17, '17 textequiv consistency errors')
Exemplo n.º 4
0
 def test_resolve_image_bitonal(self):
     """Resolve a bitonal image in full and as a 1x1 crop and check both sizes."""
     image_url = assets.url_of('kant_aufklaerung_1784-binarized/data/OCR-D-IMG-1BIT/OCR-D-IMG-1BIT_0017.png')
     ws = self.resolver.workspace_from_url(METS_HEROLD)

     # Whole image.
     full_image = ws._resolve_image_as_pil(image_url)
     self.assertEqual(full_image.size, (1457, 2083))

     # Same image restricted to the coordinates [[0, 0], [1, 1]].
     cropped_image = ws._resolve_image_as_pil(image_url, [[0, 0], [1, 1]])
     self.assertEqual(cropped_image.size, (1, 1))
 def test_check_file_grp_basic(self):
     """Check the single error reported by check_file_grp for several fileGrp pairs."""
     workspace = self.resolver.workspace_from_url(
         assets.url_of('SBB0000F29300010000/data/mets.xml'))

     # (input fileGrp, output fileGrp, the one error message expected)
     failing_cases = [
         ('foo', 'bar',
          "Input fileGrp[@USE='foo'] not in METS!"),
         ('OCR-D-IMG', 'OCR-D-IMG-BIN',
          "Output fileGrp[@USE='OCR-D-IMG-BIN'] already in METS!"),
         ('OCR-D-IMG,FOO', 'FOO',
          "Input fileGrp[@USE='FOO'] not in METS!"),
         ('OCR-D-IMG,FOO', None,
          "Input fileGrp[@USE='FOO'] not in METS!"),
     ]
     for input_grp, output_grp, expected_error in failing_cases:
         report = WorkspaceValidator.check_file_grp(workspace, input_grp, output_grp)
         self.assertFalse(report.is_valid)
         self.assertEqual(len(report.errors), 1)
         self.assertEqual(report.errors[0], expected_error)

     # No input group and an empty output group is reported as valid.
     report = WorkspaceValidator.check_file_grp(workspace, None, '')
     self.assertTrue(report.is_valid)
 def test_check_file_grp_page_id_valid(self):
     """check_file_grp restricted to page PHYS_0004 must report a valid result."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     workspace = self.resolver.workspace_from_url(mets_url)
     report = WorkspaceValidator.check_file_grp(
         workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0004')
     self.assertTrue(report.is_valid)
Exemplo n.º 7
0
 def test_no_input_file_grp(self):
     """Reading input_files on a processor run without input fileGrp must raise."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     processor = run_processor(
         DummyProcessor, resolver=self.resolver, mets_url=mets_url)
     expected_message = 'Processor is missing input fileGrp'
     with self.assertRaisesRegex(Exception, expected_message):
         # The attribute access itself is what is expected to raise.
         _ = processor.input_files
Exemplo n.º 8
0
 def test_validate_ocrd_file(self):
     """Validate the FAULTY_GLYPHS_FILE entry and expect 17 reported errors."""
     workspace = Resolver().workspace_from_url(
         assets.url_of('glyph-consistency/data/mets.xml'))
     faulty_file = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")[0]
     # Fetch the file first when it has no local filename yet.
     needs_download = not faulty_file.local_filename
     if needs_download:
         workspace.download_file(faulty_file)
     report = PageValidator.validate(ocrd_file=faulty_file)
     error_count = len(report.errors)
     self.assertEqual(error_count, 17, 'errors')
Exemplo n.º 9
0
 def test_validate_twice(self):
     """Calling _validate twice on one validator must still give a valid report."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
     validator = WorkspaceValidator(self.resolver, mets_url, download=True)
     # First pass: the result is discarded, only the second report is checked.
     validator._validate()  # pylint: disable=protected-access
     second_report = validator._validate()  # pylint: disable=protected-access
     self.assertTrue(second_report.is_valid)
Exemplo n.º 10
0
 def test_with_mets_url_input_files(self):
     """A processor run from a METS URL exposes 20 input files, all of PAGE mimetype."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     processor = run_processor(
         DummyProcessor, resolver=self.resolver, mets_url=mets_url)
     self.assertEqual(len(processor.input_files), 20)
     every_file_is_page = all(
         input_file.mimetype == MIMETYPE_PAGE
         for input_file in processor.input_files)
     self.assertTrue(every_file_is_page)
Exemplo n.º 11
0
 def test_validate_ocrd_file(self):
     """Validate the FAULTY_GLYPHS_FILE entry from inside the workspace directory."""
     workspace = Resolver().workspace_from_url(
         assets.url_of('glyph-consistency/data/mets.xml'))
     with pushd_popd(workspace.directory):
         faulty_file = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")[0]
         report = PageValidator.validate(ocrd_file=faulty_file)
         error_count = len(report.errors)
         self.assertEqual(error_count, 17, 'errors')
Exemplo n.º 12
0
 def test_resolve_image_bitonal(self):
     """Resolve a bitonal image by workspace-relative path, in full and as a 1x1 crop."""
     mets_url = pjoin(assets.url_of('kant_aufklaerung_1784-binarized'),
                      'data/mets.xml')
     ws = self.resolver.workspace_from_url(mets_url)
     relative_image = 'OCR-D-IMG-1BIT/OCR-D-IMG-1BIT_0017.png'

     # Whole image.
     full_image = ws._resolve_image_as_pil(relative_image)
     self.assertEqual(full_image.size, (1457, 2083))

     # Same image restricted to the coordinates [[0, 0], [1, 1]].
     cropped_image = ws._resolve_image_as_pil(relative_image, [[0, 0], [1, 1]])
     self.assertEqual(cropped_image.size, (1, 1))
Exemplo n.º 13
0
 def test_run_cli(self):
     """Invoke run_cli with the 'echo' executable, once fully specified and once minimal."""
     with TemporaryDirectory() as tempdir:
         # First call: every keyword argument is given explicitly.
         full_options = dict(
             mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
             resolver=Resolver(),
             workspace=None,
             page_id='page1',
             log_level='DEBUG',
             input_file_grp='INPUT',
             output_file_grp='OUTPUT',
             parameter='/path/to/param.json',
             working_dir=tempdir,
         )
         run_cli('echo', **full_options)

         # Second call: only the METS URL and a resolver are passed.
         minimal_options = dict(
             mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
             resolver=Resolver(),
         )
         run_cli('echo', **minimal_options)
Exemplo n.º 14
0
def test_resolve_image_as_pil_deprecated():
    """Calling resolve_image_as_pil must emit exactly one DeprecationWarning."""
    mets_url = os.path.join(assets.url_of('kant_aufklaerung_1784-binarized'), 'data/mets.xml')
    ws = Resolver().workspace_from_url(mets_url)
    with pytest.warns(DeprecationWarning) as caught:
        ws.resolve_image_as_pil('OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017.png')

    # One warning was recorded and it carries the deprecation text.
    assert len(caught) == 1
    first_message = str(caught[0].message)
    assert 'Call to deprecated method resolve_image_as_pil.' in first_message
 def test_check_file_grp_page_id_list(self):
     """check_file_grp with page_id given as a list must report exactly one error."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     workspace = self.resolver.workspace_from_url(mets_url)
     requested_pages = ['PHYS_0003', 'PHYS_0001']
     report = WorkspaceValidator.check_file_grp(
         workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id=requested_pages)
     self.assertFalse(report.is_valid)
     self.assertEqual(len(report.errors), 1)
Exemplo n.º 16
0
 def test_run1(self):
     """Run KrakenSegment on the binarized Kant workspace and save the METS."""
     mets_url = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
     workspace = Resolver().workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
     segmenter_options = {'level-of-operation': 'line'}
     segmenter = KrakenSegment(
         workspace,
         input_file_grp="OCR-D-IMG-BIN",
         output_file_grp="OCR-D-SEG-LINE-KRAKEN",
         parameter=segmenter_options,
     )
     segmenter.process()
     workspace.save_mets()
Exemplo n.º 17
0
 def test_param_json(self):
     """Run KrakenOcr through run_processor on the one-file METS and save the METS."""
     resolver = Resolver()
     mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
     workspace = resolver.workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
     run_processor(
         KrakenOcr,
         resolver=resolver,
         workspace=workspace,
         input_file_grp="INPUT",
         output_file_grp="OCR-D-OCR-KRAKEN",
     )
     workspace.save_mets()
Exemplo n.º 18
0
 def test_parameter_url(self):
     """Pass the parameters as a file:// URL to a JSON file holding an empty object."""
     with TemporaryDirectory() as tempdir:
         jsonpath = join(tempdir, 'params.json')
         with open(jsonpath, 'w') as param_file:
             param_file.write('{}')
         parameter_url = 'file://' + jsonpath
         processor = run_processor(
             DummyProcessor,
             parameter=parameter_url,
             resolver=self.resolver,
             mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
         )
         self.assertEqual(len(processor.input_files), 35)
Exemplo n.º 19
0
 def test_parameter(self):
     """Pass parameters as a dict loaded from a JSON file written just before."""
     with TemporaryDirectory() as tempdir:
         jsonpath = join(tempdir, 'params.json')
         with open(jsonpath, 'w') as writer:
             writer.write('{"baz": "quux"}')
         # Read the file back; the processor is run while it is still open.
         with open(jsonpath, 'r') as reader:
             loaded_params = json.load(reader)
             mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
             processor = run_processor(
                 DummyProcessor,
                 parameter=loaded_params,
                 resolver=self.resolver,
                 mets_url=mets_url)
         self.assertEqual(len(processor.input_files), 20)
 def test_check_file_grp_page_id_str(self):
     """check_file_grp with a comma-separated page_id string reports one specific error."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     workspace = self.resolver.workspace_from_url(mets_url)
     report = WorkspaceValidator.check_file_grp(
         workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN',
         page_id='PHYS_0003,PHYS_0001')
     self.assertFalse(report.is_valid)
     self.assertEqual(len(report.errors), 1)
     expected_error = "Output fileGrp[@USE='OCR-D-IMG-BIN'] already contains output for page PHYS_0001"
     self.assertEqual(report.errors[0], expected_error)
Exemplo n.º 21
0
    def testProcessorProfiling(self):
        """Check that running a processor logs how long it took.

        A stream handler writing into a ``FIFOIO(256)`` buffer is attached to
        the ``ocrd.process.profile`` logger while ``DummyProcessor`` runs.
        The captured text must then match the "Executing processor
        'ocrd-test' took 0.<digits>s" message.
        """
        initLogging()
        log_capture_string = FIFOIO(256)
        ch = logging.StreamHandler(log_capture_string)
        ch.setFormatter(logging.Formatter(LOG_FORMAT))
        profile_logger = getLogger('ocrd.process.profile')
        profile_logger.setLevel('DEBUG')
        profile_logger.addHandler(ch)

        try:
            run_processor(DummyProcessor, resolver=Resolver(), mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
            log_contents = log_capture_string.getvalue()
        finally:
            # Detach the handler before closing its stream; otherwise the
            # logger keeps a handler bound to a closed buffer and later
            # tests that log to it would write to a closed stream.
            profile_logger.removeHandler(ch)
            log_capture_string.close()
        # with open('/tmp/debug.log', 'w') as f:
        #     f.write(log_contents)
        # Check whether profile information has been logged. The pattern
        # requires a duration of the form "0.<digits>s", i.e. under one
        # second; the dot is escaped so that it only matches a literal ".".
        self.assertTrue(match(r'.*Executing processor \'ocrd-test\' took 0\.\d+s.*', log_contents))
Exemplo n.º 22
0
 def setUp(self):
     """Run the parent setUp, then load the SBB0000F29300010000 METS into self.mets."""
     super().setUp()
     mets_location = assets.url_of('SBB0000F29300010000/data/mets.xml')
     self.mets = OcrdMets(filename=mets_location)
Exemplo n.º 23
0
import os
from os.path import join, exists
from shutil import copytree, rmtree
from re import sub
from tempfile import TemporaryDirectory

from tests.base import TestCase, assets, main

from ocrd.resolver import Resolver
from ocrd.workspace import Workspace
#  from ocrd_utils.logging import setOverrideLogLevel
#  setOverrideLogLevel('DEBUG')

# Scratch directory that TestResolver.setUp wipes and repopulates.
TMP_FOLDER = '/tmp/test-pyocrd-resolver'
# URL of the METS file of the SBB0000F29300010000 test asset.
METS_HEROLD = assets.url_of('SBB0000F29300010000/data/mets.xml')
# Filesystem path of the kant_aufklaerung_1784 asset; copied into TMP_FOLDER by setUp.
FOLDER_KANT = assets.path_to('kant_aufklaerung_1784')
# Filesystem path of the test.ocrd.zip asset.
TEST_ZIP = assets.path_to('test.ocrd.zip')
# Working directory at import time.
oldpwd = os.getcwd()

# pylint: disable=redundant-unittest-assert, broad-except, deprecated-method, too-many-public-methods


class TestResolver(TestCase):
    """Test case for Resolver working on a scratch copy of the Kant asset."""

    def setUp(self):
        """Create a Resolver and copy FOLDER_KANT into a subfolder of TMP_FOLDER."""
        self.resolver = Resolver()
        self.folder = join(TMP_FOLDER, 'kant_aufklaerung_1784')
        leftover_present = exists(TMP_FOLDER)
        if leftover_present:
            # Drop whatever a previous run left behind and start empty.
            rmtree(TMP_FOLDER)
            os.makedirs(TMP_FOLDER)
        copytree(FOLDER_KANT, self.folder)
Exemplo n.º 24
0
 def setUp(self):
     """Create a Resolver and open the SBB0000F29300010000 workspace with it."""
     self.resolver = Resolver()
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     self.workspace = self.resolver.workspace_from_url(mets_url)
Exemplo n.º 25
0
# pylint: disable=import-error

import os
import shutil

from tests.base import TestCase, assets, main

from ocrd import Resolver
from ocrd_kraken.binarize import KrakenBinarize
from ocrd_utils.logging import setOverrideLogLevel

# Applied at import time, with 'DEBUG' as the override level.
setOverrideLogLevel('DEBUG')

# URL of the param-binarize.json asset.
PARAM_JSON = assets.url_of('param-binarize.json')

# Directory that TestKrakenBinarize.setUp removes and recreates.
WORKSPACE_DIR = '/tmp/ocrd-kraken-binarize-test'


class TestKrakenBinarize(TestCase):
    """Test case for KrakenBinarize using a scratch directory at WORKSPACE_DIR."""

    def setUp(self):
        """Ensure WORKSPACE_DIR exists and is empty."""
        already_there = os.path.exists(WORKSPACE_DIR)
        if already_there:
            # Remove the directory tree left behind earlier.
            shutil.rmtree(WORKSPACE_DIR)
        os.makedirs(WORKSPACE_DIR)

    #  def test_param_json(self):
    #      resolver = Resolver()
    #      workspace =  resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'), dst_dir=WORKSPACE_DIR)
    #      run_processor(
    #          KrakenBinarize,
    #          resolver=resolver,
    #          workspace=workspace,
Exemplo n.º 26
0
 def setUp(self):
     """Load the SBB0000F29300010000 METS into self.mets, then initialise logging."""
     mets_location = assets.url_of('SBB0000F29300010000/data/mets.xml')
     self.mets = OcrdMets(filename=mets_location)
     initLogging()
Exemplo n.º 27
0
# pylint: disable=import-error

import os
import shutil

from tests.base import TestCase, assets, main

from ocrd.resolver import Resolver
from ocrd_kraken.segment import KrakenSegment
# URL of the param-segment.json asset.
PARAM_JSON = assets.url_of('param-segment.json')

# Directory that TestKrakenSegment.setUp removes and recreates.
WORKSPACE_DIR = '/tmp/ocrd-ocropy-segment-test'

class TestKrakenSegment(TestCase):
    """Test case for KrakenSegment using a scratch directory at WORKSPACE_DIR."""

    def setUp(self):
        """Ensure WORKSPACE_DIR exists and is empty."""
        already_there = os.path.exists(WORKSPACE_DIR)
        if already_there:
            # Remove the directory tree left behind earlier.
            shutil.rmtree(WORKSPACE_DIR)
        os.makedirs(WORKSPACE_DIR)

    def test_run1(self):
        """Run KrakenSegment on the binarized Kant workspace and save the METS."""
        mets_url = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
        workspace = Resolver().workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
        segmenter_options = {'level-of-operation': 'line'}
        segmenter = KrakenSegment(
            workspace,
            input_file_grp="OCR-D-IMG-BIN",
            output_file_grp="OCR-D-SEG-LINE-KRAKEN",
            parameter=segmenter_options,
        )
        segmenter.process()
        workspace.save_mets()
Exemplo n.º 28
0
def test_resolve_image_as_pil(image_url, size_pil):
    """Resolve ``image_url`` cropped to [[0, 0], [1, 1]] and compare its size to ``size_pil``."""
    mets_url = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
    ws = Resolver().workspace_from_url(mets_url)
    crop_coords = [[0, 0], [1, 1]]
    cropped_image = ws._resolve_image_as_pil(image_url, crop_coords)
    assert cropped_image.size == size_pil
 def test_simple(self):
     """WorkspaceValidator.validate on the one-file METS, with download, must be valid."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
     report = WorkspaceValidator.validate(self.resolver, mets_url, download=True)
     self.assertTrue(report.is_valid)
Exemplo n.º 30
0
 def test_with_mets_url_input_files(self):
     """A processor run from a METS URL alone must expose 35 input files."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     processor = run_processor(
         DummyProcessor, resolver=self.resolver, mets_url=mets_url)
     input_file_count = len(processor.input_files)
     self.assertEqual(input_file_count, 35)