def test_check_file_grp_basic(self):
     """
     Exercise ``WorkspaceValidator.check_file_grp`` without a page filter.

     Every inconsistent input/output fileGrp combination listed below
     must yield an invalid report carrying exactly one error with the
     expected message; passing no input and an empty output is valid.
     """
     workspace = self.resolver.workspace_from_url(
         assets.url_of('SBB0000F29300010000/data/mets.xml'))
     # (input fileGrp, output fileGrp, the single expected error message)
     failing_cases = (
         ('foo', 'bar',
          "Input fileGrp[@USE='foo'] not in METS!"),
         ('OCR-D-IMG', 'OCR-D-IMG-BIN',
          "Output fileGrp[@USE='OCR-D-IMG-BIN'] already in METS!"),
         ('OCR-D-IMG,FOO', 'FOO',
          "Input fileGrp[@USE='FOO'] not in METS!"),
         ('OCR-D-IMG,FOO', None,
          "Input fileGrp[@USE='FOO'] not in METS!"),
     )
     for input_grp, output_grp, expected_error in failing_cases:
         outcome = WorkspaceValidator.check_file_grp(
             workspace, input_grp, output_grp)
         self.assertFalse(outcome.is_valid)
         self.assertEqual(len(outcome.errors), 1)
         self.assertEqual(outcome.errors[0], expected_error)
     outcome = WorkspaceValidator.check_file_grp(workspace, None, '')
     self.assertTrue(outcome.is_valid)
 def test_dimensions(self):
     """
     A PAGE file whose @imageHeight was tampered with must produce
     exactly one dimension error; once the 'dimension' check is skipped
     (and downloading is disabled) the same workspace validates cleanly.
     """
     shared_skip = [
         'page', 'mets_unique_identifier', 'mets_file_group_names',
         'mets_files', 'pixel_density', 'imagefilename'
     ]
     tamper_cmd = """sed -i 's,imageHeight="2083",imageHeight="1234",' OCR-D-GT-PAGE/PAGE_0017_PAGE.xml"""
     expected_error = "PAGE 'PAGE_0017_PAGE': @imageHeight != image's actual height (1234 != 2083)"
     with TemporaryDirectory() as tempdir:
         wsdir = join(tempdir, 'foo')
         copytree(assets.path_to('kant_aufklaerung_1784/data'), wsdir)
         with pushd_popd(wsdir):
             # rewrite the declared image height inside the copied workspace
             os.system(tamper_cmd)
             strict_report = WorkspaceValidator.validate(
                 self.resolver,
                 join(wsdir, 'mets.xml'),
                 src_dir=wsdir,
                 skip=shared_skip + ['page_xsd', 'mets_xsd'],
                 download=True)
             self.assertIn(expected_error, strict_report.errors)
             self.assertEqual(len(strict_report.errors), 1)
             self.assertEqual(strict_report.is_valid, False)
             report2 = WorkspaceValidator.validate(
                 self.resolver,
                 join(wsdir, 'mets.xml'),
                 src_dir=wsdir,
                 skip=shared_skip + ['dimension', 'page_xsd', 'mets_xsd'],
                 download=False)
         self.assertEqual(report2.is_valid, True)
Пример #3
0
def validate_tasks(tasks, workspace, page_id=None, overwrite=False):
    """
    Validate the input/output file groups of a sequence of tasks.

    The first task is checked against the file groups of the workspace
    METS. Every later task must take its input from a file group that is
    either already in the METS or listed as output of an earlier task.
    Unless ``overwrite`` is set, the output file groups of every task
    are additionally checked via ``WorkspaceValidator.check_file_grp``.

    Args:
        tasks: non-empty sequence of task objects providing ``validate()``,
            ``input_file_grps`` and ``output_file_grps`` (``tasks[0]`` is
            accessed unconditionally).
        workspace: workspace whose ``mets.file_groups`` seeds the set of
            available file groups.
        page_id: passed through unchanged to ``check_file_grp``.
        overwrite: if true, output file groups are not checked.

    Returns:
        The ``ValidationReport`` collected during the checks.

    Raises:
        Exception: if the collected report is not valid.
    """
    report = ValidationReport()
    # Work on a copy: the in-place ``+=`` below must never extend the list
    # object handed out by ``workspace.mets.file_groups``.
    available_file_grps = list(workspace.mets.file_groups)

    first_task = tasks[0]
    first_task.validate()

    # first task: check input/output file groups from METS
    WorkspaceValidator.check_file_grp(
        workspace, first_task.input_file_grps,
        '' if overwrite else first_task.output_file_grps, page_id, report)

    available_file_grps += first_task.output_file_grps
    for task in tasks[1:]:
        task.validate()
        # each input must be an existing fileGrp or an output of a previous task
        for input_file_grp in task.input_file_grps:
            if input_file_grp not in available_file_grps:
                report.add_error(
                    "Input file group not contained in METS or produced by previous steps: %s"
                    % input_file_grp)
        if not overwrite:
            WorkspaceValidator.check_file_grp(workspace, [],
                                              task.output_file_grps, page_id,
                                              report)
        # TODO disable output_file_grps checks once CLI parameter 'overwrite' is implemented
        # XXX Thu Jan 16 20:14:17 CET 2020 still not sufficiently clever.
        #  if len(prev_output_file_grps) != len(set(prev_output_file_grps)):
        #      report.add_error("Output file group specified multiple times: %s" %
        #          [grp for grp, count in Counter(prev_output_file_grps).items() if count >= 2])
        available_file_grps += task.output_file_grps
    if not report.is_valid:
        raise Exception("Invalid task sequence input/output file groups: %s" %
                        report.errors)
    return report
Пример #4
0
 def test_validate_twice(self):
     """
     Calling ``_validate`` twice on one validator instance must still
     end with a valid report.
     """
     mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
     validator = WorkspaceValidator(self.resolver, mets_url, download=True)
     # first pass: the result is deliberately discarded
     validator._validate()  # pylint: disable=protected-access
     second_pass = validator._validate()  # pylint: disable=protected-access
     self.assertTrue(second_pass.is_valid)
Пример #5
0
 def test_validate_empty(self):
     """
     A freshly created workspace reports two errors (missing unique
     identifier, no files); setting the identifier leaves one error.
     """
     with TemporaryDirectory() as tempdir:
         workspace = self.resolver.workspace_from_nothing(directory=tempdir)
         before = WorkspaceValidator.validate(self.resolver,
                                              join(tempdir, 'mets.xml'))
         self.assertEqual(len(before.errors), 2)
         self.assertIn('no unique identifier', before.errors[0])
         self.assertIn('No files', before.errors[1])

         workspace.mets.unique_identifier = 'foobar'
         workspace.save_mets()
         after = WorkspaceValidator.validate(self.resolver,
                                             join(tempdir, 'mets.xml'))
         self.assertEqual(len(after.errors), 1)
Пример #6
0
 def test_src_dir(self):
     """
     Validating via ``src_dir`` only (no METS URL), with the
     'imagefilename' check skipped, is expected to yield 42 errors.
     """
     data_dir = assets.path_to('kant_aufklaerung_1784/data')
     outcome = WorkspaceValidator.validate(self.resolver,
                                           None,
                                           src_dir=data_dir,
                                           skip=['imagefilename'],
                                           download=True)
     self.assertEqual(len(outcome.errors), 42)
Пример #7
0
 def test_validate_weird_urls(self):
     """
     Files with a ``file:/`` URL and with an unusual URL scheme are
     expected to cause two warnings and no errors; a GROUPID attribute
     set on a file is expected to cause one notice.
     """
     java_url_file = dict(ID='file1',
                          mimetype='image/png',
                          pageId='page1',
                          url='file:/java-file-url')
     odd_scheme_file = dict(ID='file2',
                            mimetype='image/png',
                            pageId='page2',
                            url='nothttp://unusual.scheme')
     with TemporaryDirectory() as tempdir:
         workspace = self.resolver.workspace_from_nothing(directory=tempdir)
         workspace.mets.unique_identifier = 'foobar'
         workspace.mets.add_file('OCR-D-GT-PAGE', **java_url_file)
         grouped = workspace.mets.add_file('OCR-D-GT-PAGE', **odd_scheme_file)
         # set the attribute directly on the underlying element
         grouped._el.set('GROUPID', 'donotuse')  # pylint: disable=protected-access
         workspace.save_mets()

         outcome = WorkspaceValidator.validate(self.resolver,
                                               join(tempdir, 'mets.xml'),
                                               skip=['pixel_density'])
         self.assertEqual(len(outcome.errors), 0)
         self.assertEqual(len(outcome.warnings), 2)
         self.assertIn("Java-specific", outcome.warnings[0])
         self.assertIn("non-HTTP", outcome.warnings[1])
         self.assertEqual(len(outcome.notices), 1)
         self.assertIn("has GROUPID attribute", outcome.notices[0])
Пример #8
0
 def test_imagefilename(self):
     """
     With the listed checks skipped and downloading disabled, the
     kant_aufklaerung_1784 workspace must validate without errors.
     """
     skipped_checks = [
         'page',
         'mets_unique_identifier',
         'mets_file_group_names',
         'mets_files',
         'pixel_density',
     ]
     outcome = WorkspaceValidator.validate(
         self.resolver,
         None,
         src_dir=assets.path_to('kant_aufklaerung_1784/data'),
         skip=skipped_checks,
         download=False)
     self.assertEqual(len(outcome.errors), 0)
 def test_check_file_grp_page_id_valid(self):
     """
     Restricting the check to page 'PHYS_0004' makes the
     OCR-D-IMG -> OCR-D-IMG-BIN combination valid.
     """
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     workspace = self.resolver.workspace_from_url(mets_url)
     outcome = WorkspaceValidator.check_file_grp(
         workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0004')
     self.assertTrue(outcome.is_valid)
Пример #10
0
 def test_validate_files_nopageid(self):
     """
     A file added without a pageId must be reported as not manifesting
     any physical page (exactly one error).
     """
     with TemporaryDirectory() as tempdir:
         workspace = self.resolver.workspace_from_nothing(directory=tempdir)
         workspace.mets.unique_identifier = 'foobar'
         workspace.mets.add_file('OCR-D-GT-PAGE',
                                 ID='file1',
                                 mimetype='image/png')
         workspace.save_mets()
         outcome = WorkspaceValidator.validate(
             self.resolver,
             join(tempdir, 'mets.xml'),
             skip=['pixel_density', 'imagefilename'])
         self.assertEqual(len(outcome.errors), 1)
         self.assertIn("does not manifest any physical page.",
                       outcome.errors[0])
Пример #11
0
 def test_validate_file_groups_unspecified(self):
     """
     The fileGrp 'OCR-D-INVALID-FILEGRP' must be reported with an
     unspecified USE category, followed by the 'No files' error.
     """
     with TemporaryDirectory() as tempdir:
         workspace = self.resolver.workspace_from_nothing(directory=tempdir)
         workspace.mets.unique_identifier = 'foobar'
         workspace.mets.add_file_group('OCR-D-INVALID-FILEGRP')
         workspace.save_mets()
         outcome = WorkspaceValidator.validate(self.resolver,
                                               join(tempdir, 'mets.xml'))
         category_error, files_error = outcome.errors[0], outcome.errors[1]
         self.assertEqual(len(outcome.errors), 2)
         self.assertEqual(
             category_error,
             "Unspecified USE category 'INVALID' in fileGrp 'OCR-D-INVALID-FILEGRP'")
         self.assertIn('No files', files_error)
Пример #12
0
 def test_validate_file_groups_bad_name(self):
     """
     The fileGrp 'OCR-D-GT-X' must be reported with an invalid USE name
     'X', followed by the 'No files' error.
     """
     with TemporaryDirectory() as tempdir:
         workspace = self.resolver.workspace_from_nothing(directory=tempdir)
         workspace.mets.unique_identifier = 'foobar'
         workspace.mets.add_file_group('OCR-D-GT-X')
         workspace.save_mets()
         mets_path = join(tempdir, 'mets.xml')
         outcome = WorkspaceValidator.validate(self.resolver, mets_path)
         self.assertEqual(len(outcome.errors), 2)
         self.assertIn("Invalid USE name 'X' in fileGrp", outcome.errors[0])
         self.assertIn('No files', outcome.errors[1])
 def test_check_file_grp_page_id_list(self):
     """
     Passing the page IDs as a list must give an invalid report with
     exactly one error for OCR-D-IMG -> OCR-D-IMG-BIN.
     """
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     workspace = self.resolver.workspace_from_url(mets_url)
     requested_pages = ['PHYS_0003', 'PHYS_0001']
     outcome = WorkspaceValidator.check_file_grp(workspace,
                                                 'OCR-D-IMG',
                                                 'OCR-D-IMG-BIN',
                                                 page_id=requested_pages)
     self.assertFalse(outcome.is_valid)
     self.assertEqual(len(outcome.errors), 1)
Пример #14
0
 def test_validate_file_groups_non_ocrd(self):
     """
     A fileGrp named 'FOO' must only trigger a notice about the missing
     'OCR-D-' prefix; the sole error is the 'No files' one.
     """
     with TemporaryDirectory() as tempdir:
         workspace = self.resolver.workspace_from_nothing(directory=tempdir)
         workspace.mets.unique_identifier = 'foobar'
         workspace.mets.add_file_group('FOO')
         workspace.save_mets()
         mets_path = join(tempdir, 'mets.xml')
         outcome = WorkspaceValidator.validate(self.resolver, mets_path)
         self.assertEqual(len(outcome.errors), 1)
         self.assertIn('No files', outcome.errors[0])
         self.assertEqual(len(outcome.notices), 1)
         self.assertIn("USE does not begin with 'OCR-D-'",
                       outcome.notices[0])
Пример #15
0
 def test_validate_pixel_no_download(self):
     """
     With nothing skipped and downloading disabled, a workspace that
     references a local binarized image must produce no errors,
     warnings or notices.
     """
     imgpath = assets.path_to(
         'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-BIN/BIN_0020.png')
     with TemporaryDirectory() as tempdir:
         workspace = self.resolver.workspace_from_nothing(directory=tempdir)
         workspace.mets.unique_identifier = 'foobar'
         workspace.mets.add_file('OCR-D-GT-BIN',
                                 ID='file1',
                                 mimetype='image/png',
                                 pageId='page1',
                                 url=imgpath)
         workspace.save_mets()
         outcome = WorkspaceValidator.validate(self.resolver,
                                               join(tempdir, 'mets.xml'),
                                               skip=[],
                                               download=False)
         # same order as before: errors, then warnings, then notices
         for level in ('errors', 'warnings', 'notices'):
             self.assertEqual(len(getattr(outcome, level)), 0)
 def test_src_dir(self):
     """
     Validating via ``src_dir`` only, with 'imagefilename' skipped, is
     expected to yield 42 errors that are ``ConsistencyError`` instances.
     """
     data_dir = assets.path_to('kant_aufklaerung_1784/data')
     outcome = WorkspaceValidator.validate(self.resolver,
                                           None,
                                           src_dir=data_dir,
                                           skip=['imagefilename'],
                                           download=True)
     print(outcome.errors)
     consistency_count = sum(
         1 for err in outcome.errors if isinstance(err, ConsistencyError))
     self.assertEqual(consistency_count, 42,
                      '42 textequiv consistency errors')
 def test_pcgtsid(self):
     """
     A PAGE file whose @pcGtsId no longer matches its mets:file/@ID
     must cause the corresponding warning.
     """
     source_dir = assets.path_to('kant_aufklaerung_1784/data')
     expected_warning = 'pc:PcGts/@pcGtsId differs from mets:file/@ID: "foo" !== "PAGE_0017_PAGE"'
     with copy_of_directory(source_dir) as wsdir, pushd_popd(wsdir):
         # overwrite the @pcGtsId attribute with "foo" for testing
         os.system(
             """sed -i 's,pcGtsId.*,pcGtsId="foo">,' OCR-D-GT-PAGE/PAGE_0017_PAGE.xml"""
         )
         outcome = WorkspaceValidator.validate(self.resolver,
                                               join(wsdir, 'mets.xml'))
         self.assertIn(expected_warning, outcome.warnings)
 def test_check_file_grp_page_id_str(self):
     """
     Passing the page IDs as a comma-separated string must give exactly
     one error, naming page PHYS_0001 as already present in the output
     fileGrp.
     """
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     workspace = self.resolver.workspace_from_url(mets_url)
     outcome = WorkspaceValidator.check_file_grp(
         workspace,
         'OCR-D-IMG',
         'OCR-D-IMG-BIN',
         page_id='PHYS_0003,PHYS_0001')
     expected_error = "Output fileGrp[@USE='OCR-D-IMG-BIN'] already contains output for page PHYS_0001"
     self.assertFalse(outcome.is_valid)
     self.assertEqual(len(outcome.errors), 1)
     self.assertEqual(outcome.errors[0], expected_error)
Пример #19
0
 def test_skip_page(self):
     """
     With the listed checks skipped, the kant_aufklaerung_1784
     workspace must validate successfully.
     """
     skipped_checks = [
         'page',
         'mets_unique_identifier',
         'mets_file_group_names',
         'mets_files',
         'pixel_density',
     ]
     data_dir = assets.path_to('kant_aufklaerung_1784/data')
     outcome = WorkspaceValidator.validate(self.resolver,
                                           None,
                                           src_dir=data_dir,
                                           download=True,
                                           skip=skipped_checks)
     self.assertTrue(outcome.is_valid)
 def test_validate_weird_urls(self):
     """
     Files with a ``file:/`` URL and with an unusual URL scheme are
     expected to make the report invalid with two errors, the first of
     which complains about the Java-specific file URL.
     """
     file_specs = (
         dict(ID='file1',
              mimetype='image/png',
              pageId='page1',
              url='file:/java-file-url'),
         dict(ID='file2',
              mimetype='image/png',
              pageId='page2',
              url='nothttp://unusual.scheme'),
     )
     with TemporaryDirectory() as tempdir:
         workspace = self.resolver.workspace_from_nothing(directory=tempdir)
         workspace.mets.unique_identifier = 'foobar'
         for spec in file_specs:
             added = workspace.mets.add_file('OCR-D-GT-PAGE', **spec)
         # the GROUPID attribute goes onto the file added last ('file2')
         added._el.set('GROUPID', 'donotuse')  # pylint: disable=protected-access
         workspace.save_mets()
         outcome = WorkspaceValidator.validate(self.resolver,
                                               join(tempdir, 'mets.xml'),
                                               skip=['pixel_density'])
         assert not outcome.is_valid
         assert len(outcome.errors) == 2
         assert "invalid (Java-specific) file URL" in outcome.errors[0]
Пример #21
0
def ocrd_cli_wrap_processor(processorClass,
                            ocrd_tool=None,
                            mets=None,
                            working_dir=None,
                            dump_json=False,
                            help=False,
                            version=False,
                            **kwargs):
    """
    Dispatch a processor CLI invocation.

    If ``dump_json``, ``help`` or ``version`` is set (checked in that
    order), ``processorClass`` is instantiated without a workspace and
    the matching flag, and nothing else happens. Otherwise a workspace is
    resolved from ``mets``, the input/output file groups taken from
    ``kwargs`` are checked, and ``run_processor`` is called.

    Raises:
        Exception: if ``mets`` is missing, refers to a non-existing local
            file, or the file group check fails.
    """
    LOG = getLogger('ocrd_cli_wrap_processor')

    # informational modes: no workspace needed
    if dump_json:
        processorClass(workspace=None, dump_json=True)
        return
    if help:
        processorClass(workspace=None, show_help=True)
        return
    if version:
        processorClass(workspace=None, show_version=True)
        return

    if mets is None:
        msg = 'Error: Missing option "-m" / "--mets".'
        LOG.error(msg)
        raise Exception(msg)

    mets_is_missing_file = (is_local_filename(mets)
                            and not isfile(get_local_filename(mets)))
    if mets_is_missing_file:
        msg = "File does not exist: %s" % mets
        LOG.error(msg)
        raise Exception(msg)

    workspace = Resolver().workspace_from_url(mets, working_dir)
    # TODO once we implement 'overwrite' CLI option and mechanism, disable the
    # `output_file_grp_ check by setting to False-y value if 'overwrite' is set
    grp_report = WorkspaceValidator.check_file_grp(workspace,
                                                   kwargs['input_file_grp'],
                                                   kwargs['output_file_grp'])
    if not grp_report.is_valid:
        raise Exception("Invalid input/output file grps:\n\t%s" %
                        '\n\t'.join(grp_report.errors))
    run_processor(processorClass,
                  ocrd_tool,
                  mets,
                  workspace=workspace,
                  **kwargs)
Пример #22
0
def validate_many(bagsdir, report_dir):
    """
    Validate many OCR-D bags at once

    BAGSDIR must contain only directories that contain unserialized OCRD-ZIP

    For every non-hidden directory in BAGSDIR, two files are written to
    REPORT_DIR (created if necessary): ``<name>.workspace.txt`` with the
    workspace validation report and ``<name>.ocrd-zip.txt`` with the
    OCRD-ZIP validation report, or with the error text if that
    validation raised.
    """
    # yes, that is bagsdir, bagdirs and bagdir. Deal with it 😎 🆒
    bagsdir = Path(bagsdir)
    bagdirs = [
        x for x in bagsdir.iterdir()
        if x.is_dir() and not x.name.startswith('.')
    ]
    total = len(bagdirs)

    report_dir = Path(report_dir)
    report_dir.mkdir(parents=True, exist_ok=True)
    # 1-based counter, used for progress logging only
    for cur, bagdir in enumerate(bagdirs, start=1):
        directory = Path(bagdir, 'data')
        LOG.info(">>>>> OCR-D-ZIP [%05d / %05d] %s", cur, total, bagdir.name)
        report = WorkspaceValidator.validate(resolver,
                                             str(Path(directory, 'mets.xml')),
                                             src_dir=directory,
                                             skip=[],
                                             download=False,
                                             page_strictness='lax')
        Path(report_dir,
             '%s.workspace.txt' % bagdir.name).write_text(report.to_xml())

        zip_report_path = Path(report_dir, '%s.ocrd-zip.txt' % bagdir.name)
        try:
            report = OcrdZipValidator(resolver,
                                      str(bagdir)).validate(skip_unzip=True)
            zip_report_path.write_text(report.to_xml())
        except Exception as e:
            # best effort: record the failure in the report file and
            # carry on with the next bag
            zip_report_path.write_text(str(e))
Пример #23
0
def ocrd_cli_wrap_processor(
        processorClass,
        ocrd_tool=None,
        mets=None,
        working_dir=None,
        dump_json=False,
        help=False,  # pylint: disable=redefined-builtin
        version=False,
        overwrite=False,
        **kwargs):
    """
    Dispatch a processor CLI invocation.

    Without any command line arguments, ``processorClass`` is
    instantiated with ``show_help=True`` and ``sys.exit(1)`` is called.
    If ``dump_json``, ``help`` or ``version`` is set, ``processorClass``
    is instantiated with those flags and ``sys.exit()`` is called.
    Otherwise logging is initialised, parameter overrides are merged, a
    workspace is resolved from ``mets``, the input/output file groups
    are checked and ``run_processor`` is called.

    Raises:
        Exception: if ``mets`` refers to a non-existing local file or the
            file group check fails.
    """
    if len(sys.argv) <= 1:
        processorClass(workspace=None, show_help=True)
        sys.exit(1)
    if dump_json or help or version:
        processorClass(workspace=None,
                       dump_json=dump_json,
                       show_help=help,
                       show_version=version)
        sys.exit()
    else:
        initLogging()
        LOG = getLogger('ocrd_cli_wrap_processor')
        # Merge parameter overrides and parameters
        if 'parameter_override' in kwargs:
            overrides = kwargs['parameter_override']
            set_json_key_value_overrides(kwargs['parameter'], *overrides)
        # TODO OCR-D/core#274: -I/--input-file-grp and -O/--output-file-grp
        # are not yet asserted to be present here.
        mets_is_missing_file = (is_local_filename(mets)
                                and not isfile(get_local_filename(mets)))
        if mets_is_missing_file:
            msg = "File does not exist: %s" % mets
            LOG.error(msg)
            raise Exception(msg)
        workspace = Resolver().workspace_from_url(mets, working_dir)
        page_id = kwargs.get('page_id')
        # XXX Removing pre-existing output files for --overwrite is not
        # possible while processors do not adhere to
        # https://github.com/OCR-D/core/issues/505; while that issue is open,
        # set 'overwrite_mode' globally on the workspace instead.
        if overwrite:
            workspace.overwrite_mode = True
        input_grps = kwargs['input_file_grp']
        # with --overwrite the output file groups are not checked
        output_grps_to_check = '' if overwrite else kwargs['output_file_grp']
        grp_report = WorkspaceValidator.check_file_grp(
            workspace, input_grps, output_grps_to_check, page_id)
        if not grp_report.is_valid:
            raise Exception("Invalid input/output file grps:\n\t%s" %
                            '\n\t'.join(grp_report.errors))
        run_processor(processorClass,
                      ocrd_tool,
                      mets,
                      workspace=workspace,
                      **kwargs)
 def test_simple(self):
     """The single-file METS of SBB0000F29300010000 must validate."""
     mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
     outcome = WorkspaceValidator.validate(self.resolver,
                                           mets_url,
                                           download=True)
     self.assertTrue(outcome.is_valid)
 def test_bad_workspace(self):
     """
     A METS location that does not exist must give an invalid report
     whose first error mentions the failed workspace instantiation.
     """
     outcome = WorkspaceValidator.validate(self.resolver, 'non existe')
     first_error = outcome.errors[0]
     self.assertFalse(outcome.is_valid)
     self.assertIn('Failed to instantiate workspace:', first_error)