Пример #1
0
 def test_merge(self):
     """
     Merge one workspace into another.

     The target workspace starts with 6 files and must list 41 files after
     the merge; one image of the merged workspace must exist in the target
     directory.
     """
     with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as target_dir:
         with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as source_dir:
             target = Workspace(self.resolver, target_dir)
             source = Workspace(self.resolver, source_dir)
             files_before = target.mets.find_all_files()
             assert len(files_before) == 6
             target.merge(source)
             files_after = target.mets.find_all_files()
             assert len(files_after) == 41
             merged_image = join(target_dir, 'OCR-D-IMG/FILE_0001_IMAGE.tif')
             assert exists(merged_image)
Пример #2
0
 def test_find_all_files_multiple_physical_pages_for_fileids(self):
     """
     Call the ``find`` subcommand with comma-separated ``--page-id`` values.

     Naming the same page twice must print a single URL; naming two
     different pages must print output that splits into 19 lines.
     """
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as wsdir:

         def invoke_find(page_ids):
             # Same CLI call each time, only the --page-id value varies.
             cli_args = ['-d', wsdir, 'find', '--page-id', page_ids, '-k', 'url']
             return self.runner.invoke(workspace_cli, cli_args)

         same_page_twice = invoke_find('PHYS_0005,PHYS_0005')
         self.assertEqual(same_page_twice.stdout, 'OCR-D-IMG/FILE_0005_IMAGE.tif\n')
         self.assertEqual(same_page_twice.exit_code, 0)

         two_pages = invoke_find('PHYS_0005,PHYS_0001')
         output_lines = two_pages.stdout.split('\n')
         self.assertEqual(len(output_lines), 19)
Пример #3
0
 def test_download_to_directory_default(self):
     """
     Download a local ``mets.xml`` into an empty directory.

     The resolver must return the bare file name and the file must exist
     below the destination directory afterwards.
     """
     with copy_of_directory(FOLDER_KANT) as src, TemporaryDirectory() as dst:
         mets_source = pjoin(src, 'data/mets.xml')
         fn = self.resolver.download_to_directory(dst, mets_source)
         self.assertEqual(fn, 'mets.xml')
         downloaded = Path(dst, fn)
         self.assertTrue(downloaded.exists())
Пример #4
0
 def test_copies_ok(self):
     """
     Run ``DummyProcessor`` twice and check the files it registers.

     The first run reads ``OCR-D-IMG`` and writes ``OUTPUT``; the second run
     reads ``OUTPUT`` and writes ``OUTPUT2``.
     """
     with copy_of_directory(assets.url_of('SBB0000F29300010000/data')) as wsdir:
         workspace = Workspace(Resolver(), wsdir)

         # Starting point: three input files, nothing in the output group yet.
         self.assertEqual(len(workspace.mets.find_files(fileGrp='OCR-D-IMG')), 3)
         self.assertEqual(len(workspace.mets.find_files(fileGrp='OUTPUT')), 0)

         run_processor(
             DummyProcessor,
             input_file_grp='OCR-D-IMG',
             output_file_grp='OUTPUT',
             workspace=workspace
         )

         copied = sorted(workspace.mets.find_files(fileGrp='OUTPUT'), key=lambda f: f.url)
         print([str(f) for f in copied])

         # Sorted by URL, the image comes first and its PAGE file second.
         image_file = copied[0]
         self.assertEqual(image_file.url, 'OUTPUT/OUTPUT_0001.tif')
         page_file = copied[1]
         self.assertEqual(page_file.url, 'OUTPUT/OUTPUT_0001.xml')
         self.assertEqual(page_from_file(page_file).pcGtsId, page_file.ID)
         self.assertEqual(page_from_file(page_file).get_Page().imageFilename, image_file.url)
         self.assertEqual(len(copied), 6)

         for query, expected in (
                 ({'ID': '//OUTPUT.*'}, 6),
                 ({'ID': '//OUTPUT.*_PAGE'}, 3),
                 ({'fileGrp': 'OUTPUT', 'mimetype': MIMETYPE_PAGE}, 3),
         ):
             self.assertEqual(len(workspace.mets.find_files(**query)), expected)

         run_processor(
             DummyProcessor,
             input_file_grp='OUTPUT',
             output_file_grp='OUTPUT2',
             workspace=workspace
         )

         recopied = sorted(workspace.mets.find_files(fileGrp='OUTPUT2'), key=lambda f: f.url)
         self.assertEqual(len(recopied), 3)
Пример #5
0
 def test_remove_file_ocrdfile(self):
     """
     Remove a file by passing the object returned from ``find_all_files``.

     After ``FILE_0005_IMAGE`` is removed, ``PHYS_0005`` must no longer be
     listed among the physical pages.
     """
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as wsdir:
         mets_path = join(wsdir, 'mets.xml')
         mets = OcrdMets(filename=mets_path)
         self.assertEqual(mets.physical_pages, ['PHYS_0001', 'PHYS_0002', 'PHYS_0005'])
         matches = mets.find_all_files(ID='FILE_0005_IMAGE')
         mets.remove_one_file(matches[0])
         self.assertEqual(mets.physical_pages, ['PHYS_0001', 'PHYS_0002'])
Пример #6
0
 def test_physical_pages_for_fileids(self):
     """Look up the physical page for a single file ID."""
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as wsdir:
         mets = OcrdMets(filename=join(wsdir, 'mets.xml'))
         pages = mets.get_physical_pages(for_fileIds=['FILE_0002_IMAGE'])
         self.assertEqual(pages, ['PHYS_0002'])
Пример #7
0
 def test_remove_file_group_rmdir(self):
     """
     Recursively remove a file group and check its directory is gone.
     """
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as wsdir:
         workspace = Workspace(self.resolver, directory=wsdir)
         group_dir = join(wsdir, 'OCR-D-IMG')
         self.assertTrue(exists(group_dir))
         workspace.remove_file_group('OCR-D-IMG', recursive=True)
         self.assertFalse(exists(group_dir))
Пример #8
0
 def test_remove_file_page_recursive_same_group(self):
     """
     Remove a file with ``page_recursive`` and ``page_same_group`` both set.

     The value returned by ``count_files()`` must drop by exactly one.
     """
     with copy_of_directory(assets.path_to('kant_aufklaerung_1784-complex/data')) as tempdir:
         with pushd_popd(tempdir):
             ws = Workspace(self.resolver, directory=tempdir)
             before = count_files()
             ws.remove_file('OCR-D-IMG-BINPAGE-sauvola_0001', page_recursive=True, page_same_group=True, force=False)
             after = count_files()
             # The message used to say '2 file deleted' although the check is
             # for a difference of one; keep the message in sync with the
             # assertion so that a failure report is not misleading.
             self.assertEqual(after, before - 1, '1 file deleted')
Пример #9
0
 def test_remove_file_page_recursive(self):
     """
     Remove a file with ``page_recursive`` across file groups.

     The METS must list 119 files before and 83 files after the first
     removal; a second page-recursive removal must not raise.
     """
     with copy_of_directory(assets.path_to('kant_aufklaerung_1784-complex/data')) as wsdir, \
          pushd_popd(wsdir):
         workspace = Workspace(self.resolver, directory=wsdir)
         self.assertEqual(len(workspace.mets.find_files()), 119)
         workspace.remove_file(
             'OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001',
             page_recursive=True,
             page_same_group=False,
             keep_file=True,
         )
         self.assertEqual(len(workspace.mets.find_files()), 83)
         workspace.remove_file('PAGE_0017_ALTO', page_recursive=True)
Пример #10
0
 def test_cli_process_smoke(self):
     """
     Invoke the ``process`` CLI with the task ``foo``.

     The call must raise an exception whose message names the missing
     ``ocrd-foo`` executable.
     """
     disableLogging()
     with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir, \
          pushd_popd(wsdir):
         expected_error = "Executable not found in PATH: ocrd-foo"
         with self.assertRaisesRegex(Exception, expected_error):
             self.invoke_cli(process_cli, ['foo'])
Пример #11
0
 def test_remove_file_regex(self):
     """
     Remove files via the pattern ``//FILE_0005.*``.

     Afterwards ``PHYS_0005`` must no longer be listed among the physical
     pages.
     """
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as wsdir:
         mets_path = join(wsdir, 'mets.xml')
         mets = OcrdMets(filename=mets_path)
         self.assertEqual(mets.physical_pages, ['PHYS_0001', 'PHYS_0002', 'PHYS_0005'])
         mets.remove_file('//FILE_0005.*')
         self.assertEqual(mets.physical_pages, ['PHYS_0001', 'PHYS_0002'])
Пример #12
0
 def test_parameter_override_wo_param(self):
     """
     Pass a single ``-P baz two`` override without any ``-p`` parameter.

     The dummy processor CLI must print exactly ``{"baz": "two"}``.
     """
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as wsdir, \
          pushd_popd(wsdir):
         cli_args = ['-P', 'baz', 'two', *DEFAULT_IN_OUT]
         # Only the captured stdout matters here.
         _, out, _ = self.invoke_cli(cli_dummy_processor, cli_args)
         print(out)
         self.assertEqual(out, '{"baz": "two"}\n')
Пример #13
0
 def test_processor_run(self):
     """Run the dummy processor CLI with a JSON parameter and expect exit code 0."""
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as wsdir, \
          pushd_popd(wsdir):
         cli_args = ['-p', '{"foo": 42}', '--mets', 'mets.xml']
         result = self.runner.invoke(cli_dummy_processor, cli_args)
         self.assertEqual(result.exit_code, 0)
Пример #14
0
 def test_remove_file_group_force(self):
     """
     Remove a non-existing file group with and without ``force``.
     """
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as wsdir:
         workspace = Workspace(self.resolver, directory=wsdir)
         missing_group = 'I DO NOT EXIST'
         # without force the unknown group must be reported
         with self.assertRaisesRegex(Exception, "No such fileGrp"):
             workspace.remove_file_group(missing_group)
         # with force the same call must pass
         workspace.remove_file_group(missing_group, force=True)
Пример #15
0
 def test_remove_file_force(self):
     """
     Remove a non-existing file ID with and without ``force``.
     """
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as wsdir:
         workspace = Workspace(self.resolver, directory=wsdir)
         missing_id = 'non-existing-id'
         # without force the unknown ID must raise
         with self.assertRaisesRegex(FileNotFoundError, "not found"):
             workspace.remove_file(missing_id)
         # with force the same call must pass
         workspace.remove_file(missing_id, force=True)
Пример #16
0
 def test_rename_file_group0(self):
     """
     Rename a file group in the METS.

     Renaming an unknown group must raise ``FileNotFoundError``; renaming
     an existing one must replace the old name with the new one in
     ``file_groups``.
     """
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as wsdir:
         mets = OcrdMets(filename=join(wsdir, 'mets.xml'))
         old_group, new_group = 'OCR-D-GT-PAGE', 'FOOBAR'
         with self.assertRaisesRegex(FileNotFoundError, "No such fileGrp 'FOOBAR'"):
             mets.rename_file_group(new_group, new_group)
         assert new_group not in mets.file_groups
         mets.rename_file_group(old_group, new_group)
         assert old_group not in mets.file_groups
         assert new_group in mets.file_groups
Пример #17
0
 def test_processor_run(self):
     """Run the dummy processor CLI with a JSON parameter and expect a falsy exit code."""
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as wsdir, \
          pushd_popd(wsdir):
         cli_args = ['-p', '{"baz": "forty-two"}', '--mets', 'mets.xml', *DEFAULT_IN_OUT]
         # Captured output streams are not inspected here.
         exit_code, _, _ = self.invoke_cli(cli_dummy_processor, cli_args)
         assert not exit_code
Пример #18
0
 def test_remove_file_group_regex(self):
     """
     Recursively remove file groups via the pattern ``//OCR-D-GT-.*``.

     Group and file counts must go from 17/35 to 15/31.
     """
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as wsdir:
         mets = OcrdMets(filename=join(wsdir, 'mets.xml'))

         def counts():
             # (number of file groups, number of files)
             return len(mets.file_groups), len(mets.find_all_files())

         self.assertEqual(counts(), (17, 35))
         mets.remove_file_group('//OCR-D-GT-.*', recursive=True)
         self.assertEqual(counts(), (15, 31))
Пример #19
0
 def test_remove_file_group_force(self):
     """
     Remove a non-existing file group with ``force`` and with ``overwrite_mode``.
     """
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as wsdir:
         workspace = Workspace(self.resolver, directory=wsdir)
         missing_group = 'I DO NOT EXIST'
         # neither force nor overwrite_mode: must raise
         with self.assertRaisesRegex(Exception, "No such fileGrp"):
             workspace.remove_file_group(missing_group)
         # force alone: must pass
         workspace.remove_file_group(missing_group, force=True)
         # overwrite_mode on the workspace: must pass even with force=False
         workspace.overwrite_mode = True
         workspace.remove_file_group(missing_group, force=False)
 def test_pcgtsid(self):
     """
     Validate a workspace whose PAGE file carries a mismatching ``@pcGtsId``.

     The ``pcGtsId`` attribute in ``OCR-D-GT-PAGE/PAGE_0017_PAGE.xml`` is
     overwritten with ``"foo"``; the validation report must then contain
     the warning that it differs from the ``mets:file/@ID``.
     """
     import re  # local import so the block does not rely on file-level imports

     with copy_of_directory(
             assets.path_to('kant_aufklaerung_1784/data')) as wsdir:
         with pushd_popd(wsdir):
             # Overwrite (not remove) the @pcGtsId attribute for testing.
             # This is done in Python instead of shelling out to ``sed -i``:
             # it does not depend on GNU sed being installed, and a failed
             # rewrite raises instead of being silently ignored.
             page_path = 'OCR-D-GT-PAGE/PAGE_0017_PAGE.xml'
             with open(page_path, 'rb') as page_in:
                 page_xml = page_in.read()
             # Equivalent of sed's 's,pcGtsId.*,pcGtsId="foo">,': replace
             # everything from ``pcGtsId`` to the end of that line.
             patched_xml, n_subs = re.subn(
                 rb'pcGtsId.*', b'pcGtsId="foo">', page_xml)
             assert n_subs, 'no pcGtsId attribute found in %s' % page_path
             with open(page_path, 'wb') as page_out:
                 page_out.write(patched_xml)
             report = WorkspaceValidator.validate(self.resolver,
                                                  join(wsdir, 'mets.xml'))
             self.assertIn(
                 'pc:PcGts/@pcGtsId differs from mets:file/@ID: "foo" !== "PAGE_0017_PAGE"',
                 report.warnings)
Пример #21
0
 def test_rename_file_group(self):
     """
     Rename the ``OCR-D-IMG`` file group of a workspace to ``FOOBAR``.

     The ``imageFilename`` in the PAGE file ``OCR-D-GT-SEG-WORD_0001`` must
     point to the new location afterwards, and the image must exist at the
     new path and no longer at the old one.
     """
     with copy_of_directory(assets.path_to('kant_aufklaerung_1784-page-region-line-word_glyph/data')) as wsdir:
         workspace = Workspace(self.resolver, directory=wsdir)
         with pushd_popd(wsdir):
             page_file_id = 'OCR-D-GT-SEG-WORD_0001'
             old_image = 'OCR-D-IMG/OCR-D-IMG_0001.tif'
             new_image = 'FOOBAR/OCR-D-IMG_0001.tif'

             page_file = next(workspace.mets.find_files(ID=page_file_id))
             pcgts_before = page_from_file(page_file)
             assert pcgts_before.get_Page().imageFilename == old_image

             workspace.rename_file_group('OCR-D-IMG', 'FOOBAR')

             page_file = next(workspace.mets.find_files(ID=page_file_id))
             pcgts_after = page_from_file(page_file)
             assert pcgts_after.get_Page().imageFilename == new_image
             assert Path(new_image).exists()
             assert not Path(old_image).exists()
Пример #22
0
 def test_crop(self):
     """
     Run ``OcrdAnybaseocrDewarper`` on the ``BIN`` group.

     Skipped when CUDA is unavailable. The number of PAGE-XML files in the
     METS must grow by one.

     NOTE(review): the method is named ``test_crop`` although it runs the
     dewarper.
     """
     if not torch.cuda.is_available():
         pytest.skip('CUDA is not available, cannot test dewarping')
     with copy_of_directory(assets.path_to('dfki-testdata/data')) as wsdir:
         ws = Workspace(self.resolver, wsdir)

         def count_pagexml():
             return len(ws.mets.find_files(mimetype=MIMETYPE_PAGE))

         n_before = count_pagexml()
         processor_kwargs = dict(
             resolver=self.resolver,
             mets_url=str(Path(wsdir, 'mets.xml')),
             input_file_grp='BIN',
             output_file_grp='DEWARP-TEST',
             parameter={'model_path': str(self.model_path)},
         )
         run_processor(OcrdAnybaseocrDewarper, **processor_kwargs)
         # pick up what the processor wrote to the METS on disk
         ws.reload_mets()
         n_after = count_pagexml()
         self.assertEqual(n_after, n_before + 1)
Пример #23
0
 def test_crop(self):
     """
     Run ``OcrdAnybaseocrCropper`` on the ``BIN`` group.

     The number of PAGE-XML files in the METS must grow by one.
     """
     with copy_of_directory(assets.path_to('dfki-testdata/data')) as wsdir:
         ws = Workspace(self.resolver, wsdir)

         def count_pagexml():
             return len(ws.mets.find_files(mimetype=MIMETYPE_PAGE))

         n_before = count_pagexml()
         processor_kwargs = dict(
             resolver=self.resolver,
             mets_url=str(Path(wsdir, 'mets.xml')),
             input_file_grp='BIN',
             output_file_grp='CROP-TEST',
             parameter={},
         )
         run_processor(OcrdAnybaseocrCropper, **processor_kwargs)
         # pick up what the processor wrote to the METS on disk
         ws.reload_mets()
         n_after = count_pagexml()
         self.assertEqual(n_after, n_before + 1)
Пример #24
0
 def test_remove_file_group0(self):
     """
     Remove a file group from the METS.

     A non-recursive removal of ``OCR-D-GT-ALTO`` must raise with a
     "not empty" message; a recursive removal of ``OCR-D-GT-PAGE`` must take
     the group and file counts from 17/35 to 16/33.
     """
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as wsdir:
         mets = OcrdMets(filename=join(wsdir, 'mets.xml'))

         def counts():
             # (number of file groups, number of files)
             return len(mets.file_groups), len(mets.find_all_files())

         self.assertEqual(counts(), (17, 35))
         with self.assertRaisesRegex(Exception, "not empty"):
             mets.remove_file_group('OCR-D-GT-ALTO')
         mets.remove_file_group('OCR-D-GT-PAGE', recursive=True)
         self.assertEqual(counts(), (16, 33))
Пример #25
0
    def test_copy_vs_clone(self):
        """
        Compare ``workspace clone`` results with a plain directory copy.

        A clone without ``-a`` must lack the three file group directories
        that the copy has; a clone with ``-a`` must contain the same entries
        as the copy.
        """
        src_dir = assets.path_to('kant_aufklaerung_1784/data')
        src_mets = join(src_dir, 'mets.xml')
        with TemporaryDirectory() as tempdir:
            shallowcloneddir = join(tempdir, 'cloned-shallow')  # cloned without download
            fullcloneddir = join(tempdir, 'cloned-all')  # cloned with download
            copieddir = join(tempdir, 'copied')  # copied

            for clone_target in (fullcloneddir, shallowcloneddir):
                Path(clone_target).mkdir()

            clone_invocations = (
                ['clone', src_mets, shallowcloneddir],
                ['clone', '-a', src_mets, fullcloneddir],
            )
            for clone_args in clone_invocations:
                result = self.runner.invoke(workspace_cli, clone_args)
                self.assertEqual(result.exit_code, 0)

            with copy_of_directory(src_dir, copieddir):
                shallow_vs_copied = dircmp(shallowcloneddir, copieddir)
                self.assertEqual(
                    set(shallow_vs_copied.right_only),
                    {'OCR-D-GT-ALTO', 'OCR-D-GT-PAGE', 'OCR-D-IMG'})

                full_vs_copied = dircmp(fullcloneddir, copieddir)
                # XXX diff_files is deliberately not compared: mets.xml will
                # not have the exact same content because URLs that are
                # actually files will be marked up as such with
                # @LOCTYPE/@OTHERLOCTYPE
                self.assertEqual(full_vs_copied.left_only, [])
                self.assertEqual(full_vs_copied.right_only, [])
Пример #26
0
 def test_task_run(self):
     """
     Run two chained ``dummy`` tasks on a workspace.

     An extra file is added to the METS beforehand; after both tasks have
     run, the METS must list six more files than before.
     """
     resolver = Resolver()
     with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir, \
          pushd_popd(wsdir):
         ws = resolver.workspace_from_url('mets.xml')
         extra_file = dict(
             content='',
             local_filename='GRP0/foo',
             ID='file0',
             mimetype=MIMETYPE_PAGE,
             pageId=None,
         )
         ws.add_file('GRP0', **extra_file)
         ws.save_mets()
         files_before = len(ws.mets.find_files())

         tasks = [
             "dummy -I OCR-D-IMG -O GRP1",
             "dummy -I GRP1 -O GRP2",
         ]
         run_tasks('mets.xml', 'DEBUG', None, tasks)
         ws.reload_mets()

         # step 1: 2 images in OCR-D-IMG -> 2 images 2 PAGEXML in GRP1
         # step 2: 2 images and 2 PAGEXML in GRP1 -> process just the PAGEXML
         files_after = len(ws.mets.find_files())
         self.assertEqual(files_after, files_before + 6)