예제 #1
0
 def runTest(self):
     """Chain three Tesserocr segmentation processors on one workspace.

     The workspace is created in WORKSPACE_DIR from the
     'kant_aufklaerung_1784-binarized' METS asset. Every processor reads the
     file group written by its predecessor, and the METS is saved once after
     the last step.
     """
     ocrd_resolver = Resolver()
     mets_url = assets.url_of('kant_aufklaerung_1784-binarized/mets.xml')
     workspace = ocrd_resolver.workspace_from_url(mets_url,
                                                  directory=WORKSPACE_DIR)

     # Step 1: OCR-D-IMG -> OCR-D-SEG-BLOCK
     region_step = TesserocrSegmentRegion(
         workspace,
         input_file_grp="OCR-D-IMG",
         output_file_grp="OCR-D-SEG-BLOCK",
     )
     region_step.process()

     # Step 2: OCR-D-SEG-BLOCK -> OCR-D-SEG-LINE
     line_step = TesserocrSegmentLine(
         workspace,
         input_file_grp="OCR-D-SEG-BLOCK",
         output_file_grp="OCR-D-SEG-LINE",
     )
     line_step.process()

     # Step 3: OCR-D-SEG-LINE -> OCR-D-SEG-WORD
     word_step = TesserocrSegmentWord(
         workspace,
         input_file_grp="OCR-D-SEG-LINE",
         output_file_grp="OCR-D-SEG-WORD",
     )
     word_step.process()

     workspace.save_mets()
예제 #2
0
 def runTest(self):
     """Segment regions and lines, then recognize with word-level output.

     The workspace is created in WORKSPACE_DIR from the
     'kant_aufklaerung_1784-page-block-line-word' METS asset using a
     Resolver constructed with cache_enabled=True. The METS is saved after
     every processor run.
     """
     ocrd_resolver = Resolver(cache_enabled=True)
     mets_url = assets.url_of(
         'kant_aufklaerung_1784-page-block-line-word/mets.xml')
     workspace = ocrd_resolver.workspace_from_url(mets_url,
                                                  directory=WORKSPACE_DIR)

     # Step 1: OCR-D-IMG -> OCR-D-SEG-BLOCK
     region_step = TesserocrSegmentRegion(
         workspace,
         input_file_grp="OCR-D-IMG",
         output_file_grp="OCR-D-SEG-BLOCK",
     )
     region_step.process()
     workspace.save_mets()

     # Step 2: OCR-D-SEG-BLOCK -> OCR-D-SEG-LINE
     line_step = TesserocrSegmentLine(
         workspace,
         input_file_grp="OCR-D-SEG-BLOCK",
         output_file_grp="OCR-D-SEG-LINE",
     )
     line_step.process()
     workspace.save_mets()

     # Step 3: OCR-D-SEG-LINE -> OCR-D-OCR-TESS with textequiv_level 'word'
     recognition_params = {'textequiv_level': 'word'}
     recognize_step = TesserocrRecognize(
         workspace,
         input_file_grp="OCR-D-SEG-LINE",
         output_file_grp="OCR-D-OCR-TESS",
         parameter=recognition_params,
     )
     recognize_step.process()
     workspace.save_mets()
예제 #3
0
 def runTest(self):
     """Run segmentation and recognition twice: line level, then glyph level.

     The workspace is created in WORKSPACE_DIR from the
     'kant_aufklaerung_1784-page-block-line-word' METS asset. Five processor
     runs follow, each one saving the METS afterwards:
     region segmentation, line segmentation, recognition on the line group
     (textequiv_level 'line'), word segmentation, and recognition on the
     word group (textequiv_level 'glyph').
     """
     ocrd_resolver = Resolver()
     mets_url = assets.url_of(
         'kant_aufklaerung_1784-page-block-line-word/mets.xml')
     workspace = ocrd_resolver.workspace_from_url(mets_url,
                                                  directory=WORKSPACE_DIR)

     # Step 1: OCR-D-IMG -> OCR-D-SEG-BLOCK
     region_step = TesserocrSegmentRegion(
         workspace,
         input_file_grp="OCR-D-IMG",
         output_file_grp="OCR-D-SEG-BLOCK",
     )
     region_step.process()
     workspace.save_mets()

     # Step 2: OCR-D-SEG-BLOCK -> OCR-D-SEG-LINE
     line_step = TesserocrSegmentLine(
         workspace,
         input_file_grp="OCR-D-SEG-BLOCK",
         output_file_grp="OCR-D-SEG-LINE",
     )
     line_step.process()
     workspace.save_mets()

     # Step 3: OCR-D-SEG-LINE -> OCR-D-OCR-TESS
     # NOTE: per the original author's remark, adding 'model': 'Fraktur'
     # to the parameters needs the tesseract-ocr-script-frak dependency.
     line_level_params = {'textequiv_level': 'line'}
     line_recognizer = TesserocrRecognize(
         workspace,
         input_file_grp="OCR-D-SEG-LINE",
         output_file_grp="OCR-D-OCR-TESS",
         parameter=line_level_params,
     )
     line_recognizer.process()
     workspace.save_mets()

     # Step 4: OCR-D-SEG-LINE -> OCR-D-SEG-WORD
     word_step = TesserocrSegmentWord(
         workspace,
         input_file_grp="OCR-D-SEG-LINE",
         output_file_grp="OCR-D-SEG-WORD",
     )
     word_step.process()
     workspace.save_mets()

     # Step 5: OCR-D-SEG-WORD -> OCR-D-OCR-TESS-W2C
     # NOTE: the same 'model': 'Fraktur' remark as in step 3 applies here.
     glyph_level_params = {'textequiv_level': 'glyph'}
     glyph_recognizer = TesserocrRecognize(
         workspace,
         input_file_grp="OCR-D-SEG-WORD",
         output_file_grp="OCR-D-OCR-TESS-W2C",
         parameter=glyph_level_params,
     )
     glyph_recognizer.process()
     workspace.save_mets()
예제 #4
0
import os
import shutil

from test.base import TestCase, main, assets

from ocrd.resolver import Resolver
from ocrd_tesserocr import TesserocrSegmentRegion

# URL of the METS asset the test workspace is built from.
METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')

# Scratch directory for the test workspace; setUp() deletes and recreates it.
WORKSPACE_DIR = '/tmp/pyocrd-test-segment-region-tesserocr'


class TestTesserocrSegmentRegionTesseract(TestCase):
    """Run TesserocrSegmentRegion on a workspace built from METS_HEROLD_SMALL."""

    def setUp(self):
        """Start every run with an empty WORKSPACE_DIR."""
        leftover_present = os.path.exists(WORKSPACE_DIR)
        if leftover_present:
            shutil.rmtree(WORKSPACE_DIR)
        os.makedirs(WORKSPACE_DIR)

    def runTest(self):
        """Process OCR-D-IMG into OCR-D-SEG-BLOCK and save the METS."""
        ocrd_resolver = Resolver()
        workspace = ocrd_resolver.workspace_from_url(
            METS_HEROLD_SMALL,
            dst_dir=WORKSPACE_DIR,
        )
        region_step = TesserocrSegmentRegion(
            workspace,
            input_file_grp="OCR-D-IMG",
            output_file_grp="OCR-D-SEG-BLOCK",
        )
        region_step.process()
        workspace.save_mets()


# Call main() (imported from test.base) when this file is executed directly.
if __name__ == '__main__':
    main()
예제 #5
0
# pylint: disable=import-error

import os
import shutil
from tempfile import TemporaryDirectory

from test.base import TestCase, assets, main

from ocrd.resolver import Resolver
from ocrd_ocropy.segment import OcropySegment

# URL of the 'param-segment.json' asset.
PARAM_JSON = assets.url_of('param-segment.json')

# Scratch directory that setUp() deletes and recreates before each test.
WORKSPACE_DIR = '/tmp/ocrd-ocropy-segment-test'


class TestOcropySegment(TestCase):
    def setUp(self):
        """Start every test with an empty WORKSPACE_DIR."""
        leftover_present = os.path.exists(WORKSPACE_DIR)
        if leftover_present:
            # Drop whatever a previous run left behind.
            shutil.rmtree(WORKSPACE_DIR)
        os.makedirs(WORKSPACE_DIR)

    def test_run1(self):
        resolver = Resolver()
        with TemporaryDirectory() as tempdir:
            workspace = resolver.workspace_from_url(assets.path_to(
                'kant_aufklaerung_1784-binarized/data/mets.xml'),
                                                    dst_dir=tempdir)
            proc = OcropySegment(
                workspace,
                input_file_grp="OCR-D-IMG-BIN",
예제 #6
0
 def setUp(self):
     """Load the 'SBB0000F29300010000/mets.xml' asset into self.mets."""
     mets_url = assets.url_of('SBB0000F29300010000/mets.xml')
     self.mets = OcrdMets(filename=mets_url)
예제 #7
0
 def setUp(self):
     """Create self.resolver and a workspace for 'SBB0000F29300010000/mets.xml'."""
     self.resolver = Resolver()
     mets_url = assets.url_of('SBB0000F29300010000/mets.xml')
     self.workspace = self.resolver.workspace_from_url(mets_url)
예제 #8
0
import os
import shutil

from test.base import TestCase, main, assets, skip

from ocrd.resolver import Resolver
from ocrd_tesserocr.segment_word import TesserocrSegmentWord
from ocrd_tesserocr.segment_line import TesserocrSegmentLine
from ocrd_tesserocr.segment_region import TesserocrSegmentRegion
from ocrd_tesserocr.recognize import TesserocrRecognize

# Previously used asset, kept for reference:
#   assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
# NOTE(review): the constant name still says HEROLD although it now points
# at the kant_aufklaerung_1784 asset.
METS_HEROLD_SMALL = assets.url_of('kant_aufklaerung_1784/data/mets.xml')

# Scratch directory for the test workspace; setUp() deletes and recreates it.
WORKSPACE_DIR = '/tmp/pyocrd-test-recognizer'


class TestTesserocrRecognize(TestCase):
    """Exercise Tesserocr processors on a workspace built from METS_HEROLD_SMALL."""

    def setUp(self):
        """Start every run with an empty WORKSPACE_DIR."""
        leftover_present = os.path.exists(WORKSPACE_DIR)
        if leftover_present:
            shutil.rmtree(WORKSPACE_DIR)
        os.makedirs(WORKSPACE_DIR)

    # NOTE: a skip("Takes too long") marker was left commented out here.
    def runTest(self):
        """Process OCR-D-IMG into OCR-D-SEG-BLOCK on a fresh workspace."""
        ocrd_resolver = Resolver()
        workspace = ocrd_resolver.workspace_from_url(
            METS_HEROLD_SMALL,
            dst_dir=WORKSPACE_DIR,
        )
        region_step = TesserocrSegmentRegion(
            workspace,
            input_file_grp="OCR-D-IMG",
            output_file_grp="OCR-D-SEG-BLOCK",
        )
        region_step.process()
 def runTest(self):
     """Validate the 'mets_one_file.xml' asset and print the report as XML."""
     validate = WorkspaceValidator.validate_url
     report = validate(
         self.resolver,
         assets.url_of('SBB0000F29300010000/mets_one_file.xml'),
     )
     report_xml = report.to_xml()
     print(report_xml)
예제 #10
0
 def runTest(self):
     """Run TesserocrSegmentRegion with its default arguments and save the METS.

     The workspace comes from the 'SBB0000F29300010000/mets_one_file.xml'
     asset via a Resolver constructed with cache_enabled=True.
     """
     ocrd_resolver = Resolver(cache_enabled=True)
     mets_url = assets.url_of('SBB0000F29300010000/mets_one_file.xml')
     workspace = ocrd_resolver.workspace_from_url(mets_url)
     region_step = TesserocrSegmentRegion(workspace)
     region_step.process()
     workspace.save_mets()
예제 #11
0
import os
from shutil import copytree, rmtree

from ocrd.model import OcrdExif
from ocrd.resolver import Resolver
from test.base import TestCase, assets, main

# Scratch directory that setUp() clears before copying the kant asset into it.
TMP_FOLDER = '/tmp/test-pyocrd-resolver'
# URL of the METS asset.
METS_HEROLD = assets.url_of('SBB0000F29300010000/mets.xml')
# The slices below drop the first len('file://') characters of the asset URL.
# Presumably url_of() returns a 'file://' URL, so the result is a plain
# filesystem path -- TODO confirm.
FOLDER_KANT = assets.url_of('kant_aufklaerung_1784')[len('file://'):]
TEST_ZIP = assets.url_of('test.ocrd.zip')[len('file://'):]

class TestResolver(TestCase):

    def setUp(self):
        """Create the resolver and a fresh copy of the kant asset folder.

        Sets self.resolver (cache enabled) and self.folder, removes any
        TMP_FOLDER left behind by an earlier run, recreates it, and copies
        FOLDER_KANT to self.folder.
        """
        self.resolver = Resolver(cache_enabled=True)
        self.folder = os.path.join(TMP_FOLDER, 'kant_aufklaerung_1784')
        if os.path.exists(TMP_FOLDER):
            rmtree(TMP_FOLDER)
        # Recreate the scratch folder unconditionally. It used to be
        # recreated only when it had existed before, so a first run relied
        # on copytree() creating the missing parent directory itself.
        os.makedirs(TMP_FOLDER)
        # self.folder must not exist yet; copytree() creates it.
        copytree(FOLDER_KANT, self.folder)

    def test_unpack_workspace(self):
        """Unpack TEST_ZIP, expect two TIFF files, download them, print their EXIF XML."""
        workspace = self.resolver.unpack_workspace_from_filename(TEST_ZIP)
        tiff_files = workspace.mets.find_files(mimetype='image/tiff')
        tiff_count = len(tiff_files)
        self.assertEqual(tiff_count, 2, '2 TIF')

        # Download every file before any EXIF data is read.
        for tiff_file in tiff_files:
            workspace.download_file(tiff_file)

        exif_reports = [
            OcrdExif.from_filename(tiff_file.local_filename).to_xml()
            for tiff_file in tiff_files
        ]
        print(exif_reports)

    def test_workspace_from_folder(self):
예제 #12
0
import os
from shutil import copytree, rmtree
from test.base import TestCase, assets, main

from ocrd.model import OcrdExif
from ocrd.resolver import Resolver

# Scratch directory that setUp() clears before copying the kant asset into it.
TMP_FOLDER = '/tmp/test-pyocrd-resolver'
# URL of the METS asset used by test_workspace_from_url.
METS_HEROLD = assets.url_of('SBB0000F29300010000/mets.xml')
# Source folder that setUp() copies into TMP_FOLDER.
FOLDER_KANT = assets.path_to('kant_aufklaerung_1784')
# Location of the 'test.ocrd.zip' asset as returned by assets.path_to().
TEST_ZIP = assets.path_to('test.ocrd.zip')


class TestResolver(TestCase):
    def setUp(self):
        """Create the resolver and a fresh copy of the kant asset folder.

        Sets self.resolver and self.folder, removes any TMP_FOLDER left
        behind by an earlier run, recreates it, and copies FOLDER_KANT to
        self.folder.
        """
        self.resolver = Resolver()
        self.folder = os.path.join(TMP_FOLDER, 'kant_aufklaerung_1784')
        if os.path.exists(TMP_FOLDER):
            rmtree(TMP_FOLDER)
        # Recreate the scratch folder unconditionally. It used to be
        # recreated only when it had existed before, so a first run relied
        # on copytree() creating the missing parent directory itself.
        os.makedirs(TMP_FOLDER)
        # self.folder must not exist yet; copytree() creates it.
        copytree(FOLDER_KANT, self.folder)

    def test_workspace_from_url(self):
        """Build a workspace from METS_HEROLD and download its first OCR-D-IMG file."""
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        # Look up the files registered under the 'OCR-D-IMG' file group.
        input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        # Only the first match is downloaded.
        image_file = input_files[0]
        f = workspace.download_file(image_file)