예제 #1
0
def test_workspace_from_nothing_noclobber(tmp_path):
    """Re-creating a workspace in an already initialised directory shall fail.

    The first ``workspace_from_nothing`` call is expected to succeed and to
    report ``tmp_path`` as its directory; the second call on the same
    directory is expected to raise with a message about the existing METS.
    """

    # arrange: first creation
    ws2 = Resolver().workspace_from_nothing(tmp_path)
    assert ws2.directory == tmp_path

    # act: second creation in the same directory
    with pytest.raises(Exception) as exc:
        Resolver().workspace_from_nothing(tmp_path)

    # assert: compare against the raised exception itself (``exc.value``);
    # the string form of the ExceptionInfo wrapper is not the plain message
    the_msg = "METS 'mets.xml' already exists in '%s' and clobber_mets not set" % tmp_path
    assert the_msg in str(exc.value)
 def test_validate_ocrd_file(self):
     """Validate the ``FAULTY_GLYPHS_FILE`` entry and count its consistency errors."""
     workspace = Resolver().workspace_from_url(
         assets.url_of('glyph-consistency/data/mets.xml'))
     with pushd_popd(workspace.directory):
         faulty_file = workspace.mets.find_all_files(ID="FAULTY_GLYPHS_FILE")[0]
         report = PageValidator.validate(ocrd_file=faulty_file)
         # only ConsistencyError entries count towards the expected total
         consistency_errors = [err for err in report.errors
                               if isinstance(err, ConsistencyError)]
         self.assertEqual(len(consistency_errors), 17, '17 textequiv consistency errors')
예제 #3
0
def test_workspace_init_missing_mets():
    """Constructing a Workspace for ``foo/bar`` is expected to raise."""

    with pytest.raises(Exception) as excinfo:
        Workspace(Resolver(), "foo/bar")

    # the message of the raised exception must mention the missing file
    message = str(excinfo.value)
    assert "File does not exist" in message
예제 #4
0
def _fixture_plain_workspace(tmp_path):
    """Yield a workspace created from nothing in ``tmp_path``.

    The current working directory is switched to ``tmp_path`` before the
    workspace is yielded and switched back to the previous directory once
    the generator is resumed.
    """
    workspace = Resolver().workspace_from_nothing(directory=tmp_path)
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    chdir(original_cwd)
예제 #5
0
def test_handle_response_for_invalid_content(mock_get, response_dir):
    """If invalid content is returned, store warning log entry

    ``mock_get`` is configured to answer with status 200, a ``text/plain``
    content type and a non-XML body; the test then checks that a WARNING
    from ``ocrd_models.utils.handle_oai_response`` was logged.
    """

    # arrange
    url = 'http://digital.bibliothek.uni-halle.de/hd/oai/?verb=GetRecord&metadataPrefix=mets&mode=xml&identifier=foo'
    mock_get.return_value.status_code = 200
    mock_get.return_value.content = b'foo bar'
    headers = {'Content-Type': 'text/plain'}
    mock_get.return_value.headers = headers
    resolver = Resolver()
    initLogging()

    # capture log
    log = getLogger('ocrd_models.utils.handle_oai_response')
    capt = FIFOIO(256)
    sh = StreamHandler(capt)
    sh.setFormatter(Formatter(LOG_FORMAT))
    log.addHandler(sh)

    try:
        # act
        resolver.download_to_directory(response_dir, url)

        # assert
        mock_get.assert_called_once_with(url)
        log_output = capt.getvalue()
        assert 'WARNING ocrd_models.utils.handle_oai_response' in log_output
    finally:
        # detach the capture handler so it does not stay on the named logger
        # after this test, whether the assertions passed or not
        log.removeHandler(sh)
예제 #6
0
 def setUp(self):
     """Create a caching resolver and copy ``FOLDER_KANT`` below ``TMP_FOLDER``."""
     self.resolver = Resolver(cache_enabled=True)
     self.folder = os.path.join(TMP_FOLDER, 'kant_aufklaerung_1784')
     tmp_folder_present = os.path.exists(TMP_FOLDER)
     if tmp_folder_present:
         # drop whatever is there and start with an empty folder
         rmtree(TMP_FOLDER)
         os.makedirs(TMP_FOLDER)
     copytree(FOLDER_KANT, self.folder)
예제 #7
0
def bashlib_input_files(**kwargs):
    """
    List input files for processing

    Instantiate a processor and workspace from the given processing options.
    Then loop through the input files of the input fileGrp, and for each one,
    print its `url`, `ID`, `mimetype` and `pageId`, as well as its recommended
    `outputFileId` (from ``make_file_id``).

    (The printing format is one associative array initializer per line.)

    Every value is written inside single quotes. A single quote occurring
    inside a value is escaped, so the value remains one shell word instead of
    terminating the quoting early.

    Keyword arguments read here: ``mets``, ``working_dir``, ``page_id``,
    ``input_file_grp`` and ``output_file_grp``.

    Raises:
        Exception: if ``mets`` is a local filename that does not exist.
    """

    def _single_quoted(value):
        # close the quote, emit a backslash-escaped quote, reopen the quote
        return "'%s'" % str(value).replace("'", "'\\''")

    initLogging()
    mets = kwargs.pop('mets')
    working_dir = kwargs.pop('working_dir')
    if is_local_filename(mets) and not isfile(get_local_filename(mets)):
        msg = "File does not exist: %s" % mets
        raise Exception(msg)
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets, working_dir)
    processor = Processor(workspace,
                          ocrd_tool=None,
                          page_id=kwargs['page_id'],
                          input_file_grp=kwargs['input_file_grp'],
                          output_file_grp=kwargs['output_file_grp'])
    for input_file in processor.input_files:
        for field in ['url', 'ID', 'mimetype', 'pageId']:
            # make this bash-friendly (show initialization for associative array)
            print("[%s]=%s" % (field, _single_quoted(getattr(input_file, field))), end=' ')
        # the last entry ends the line for this input file
        print("[outputFileId]=%s" %
              _single_quoted(make_file_id(input_file, kwargs['output_file_grp'])))
def test_workspace_remove_groups_unforce(workspace_directory):
    """Remove file groups matching a pattern with ``recursive=True``"""

    mets_path = os.path.join(workspace_directory, 'mets.xml')
    alto_grp_query = './/{http://www.loc.gov/METS/}fileGrp[@USE="OCR-D-GT-ALTO"]'

    # arrange: exactly one ALTO group holding two files before removal
    root_before = ET.parse(mets_path).getroot()
    alto_groups = root_before.findall(alto_grp_query)
    assert len(alto_groups) == 1
    alto_files = alto_groups[0].findall('.//{http://www.loc.gov/METS/}file')
    assert len(alto_files) == 2

    # act
    workspace = Workspace(Resolver(), workspace_directory)
    workspace.remove_file_group('//OCR-D-GT.*', recursive=True)
    workspace.save_mets()

    # assert: the saved METS no longer contains the ALTO group
    root_after = ET.parse(mets_path).getroot()
    assert root_after is not None
    remaining_groups = root_after.findall(alto_grp_query)
    assert not remaining_groups
예제 #9
0
def _fixture_workspace_sample_features(tmp_path):
    """Yield a workspace for a copy of ``tests/data/sample-features``.

    The data is copied to ``tmp_path``, the workspace is opened from the
    copied ``mets.xml``, and the working directory is ``tmp_path`` while the
    workspace is in use; the previous directory is restored afterwards.
    """
    target_dir = str(tmp_path)
    copytree('tests/data/sample-features', target_dir)
    workspace = Resolver().workspace_from_url(join(target_dir, 'mets.xml'))
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    chdir(original_cwd)
예제 #10
0
def _fixture_workspace_gutachten_data(tmp_path):
    """Yield a workspace for a copy of the ``gutachten/data`` asset.

    The asset is copied to ``tmp_path``, the workspace is opened from the
    copied ``mets.xml``, and the working directory is ``tmp_path`` while the
    workspace is in use; the previous directory is restored afterwards.
    """
    target_dir = str(tmp_path)
    copytree(assets.path_to('gutachten/data'), target_dir)
    workspace = Resolver().workspace_from_url(join(target_dir, 'mets.xml'))
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    chdir(original_cwd)
예제 #11
0
 def test_validate_ocrd_file(self):
     """Validate the ``FAULTY_GLYPHS_FILE`` entry and expect 17 reported errors."""
     mets_url = assets.url_of('glyph-consistency/data/mets.xml')
     workspace = Resolver().workspace_from_url(mets_url)
     # validation runs with the workspace directory as working directory
     with pushd_popd(workspace.directory):
         faulty_file = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")[0]
         report = PageValidator.validate(ocrd_file=faulty_file)
         error_count = len(report.errors)
         self.assertEqual(error_count, 17, 'errors')
예제 #12
0
 def test_validate_ocrd_file(self):
     """Validate the ``FAULTY_GLYPHS_FILE`` entry and expect 17 reported errors."""
     workspace = Resolver().workspace_from_url(
         assets.url_of('glyph-consistency/data/mets.xml'))
     matches = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")
     faulty_file = matches[0]
     # download only when the file has no local filename yet
     if not faulty_file.local_filename:
         workspace.download_file(faulty_file)
     report = PageValidator.validate(ocrd_file=faulty_file)
     error_count = len(report.errors)
     self.assertEqual(error_count, 17, 'errors')
예제 #13
0
def test_resolve_image0():
    """Resolve the first ``OCR-D-IMG`` file with and without coordinates."""
    workspace = Resolver().workspace_from_url(METS_HEROLD)
    first_image = workspace.mets.find_all_files(fileGrp='OCR-D-IMG')[0]

    # without coordinates
    full_image = workspace._resolve_image_as_pil(first_image.url)
    assert full_image.size == (2875, 3749)

    # with the coordinates [[0, 0], [1, 1]]
    cropped_image = workspace._resolve_image_as_pil(first_image.url, [[0, 0], [1, 1]])
    assert cropped_image.size == (1, 1)
예제 #14
0
def test_resolve_image_as_pil_deprecated():
    """Calling ``resolve_image_as_pil`` shall emit one DeprecationWarning."""
    base_url = assets.url_of('kant_aufklaerung_1784-binarized')
    mets_url = os.path.join(base_url, 'data/mets.xml')
    workspace = Resolver().workspace_from_url(mets_url)

    # act
    with pytest.warns(DeprecationWarning) as caught:
        workspace.resolve_image_as_pil('OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017.png')

    # assert
    assert len(caught) == 1
    first_warning = caught[0]
    assert 'Call to deprecated method resolve_image_as_pil.' in str(first_warning.message)
예제 #15
0
 def test_run_cli(self):
     """Call ``run_cli`` for ``echo`` twice: with all options, then with only two."""
     with TemporaryDirectory() as workdir:
         # first call: every option spelled out
         full_options = dict(
             mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
             resolver=Resolver(),
             workspace=None,
             page_id='page1',
             log_level='DEBUG',
             input_file_grp='INPUT',
             output_file_grp='OUTPUT',
             parameter='/path/to/param.json',
             working_dir=workdir,
         )
         run_cli('echo', **full_options)
         # second call: only METS URL and resolver are passed
         minimal_options = dict(
             mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
             resolver=Resolver(),
         )
         run_cli('echo', **minimal_options)
예제 #16
0
def test_workspace_remove_group_not_found(workspace_directory):
    """Group identified by name not found raises exception"""

    # arrange
    resolver = Resolver()
    workspace = Workspace(resolver, workspace_directory)

    # act
    with pytest.raises(Exception) as exc:
        workspace.remove_file_group('FOO-BAR')

    # assert: check the message of the raised exception (``exc.value``),
    # not the string form of the ExceptionInfo wrapper
    assert "No such fileGrp" in str(exc.value)
예제 #17
0
def _fixture_workspace_kant_aufklaerung(tmp_path):
    """Yield a workspace for a copy of the ``kant_aufklaerung_1784/data/`` asset.

    The asset is copied to ``tmp_path``; the workspace is opened from the
    copied ``mets.xml`` with ``tmp_path`` passed as ``src_baseurl``. The
    working directory is ``tmp_path`` while the workspace is in use and the
    previous directory is restored afterwards.
    """
    copytree(assets.path_to('kant_aufklaerung_1784/data/'), str(tmp_path))
    mets_path = join(tmp_path, 'mets.xml')
    workspace = Resolver().workspace_from_url(mets_path, src_baseurl=tmp_path)
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    chdir(original_cwd)
예제 #18
0
def test_workspace_from_url_with_rel_dir(tmp_path):
    """A relative ``dst_dir`` full of ``..`` segments shall end up at ``tmp_path``."""

    # arrange: many parent-directory steps followed by tmp_path without its
    # first character
    tmp_dir = str(tmp_path)
    bogus_dst_dir = '../../../../../../../../../../../../../../../../%s' % tmp_dir[1:]

    # act
    with pushd_popd(FOLDER_KANT):
        resolver = Resolver()
        ws1 = resolver.workspace_from_url('data/mets.xml', dst_dir=bogus_dst_dir)

    # assert
    expected_mets = os.path.join(tmp_path, 'mets.xml')
    assert expected_mets == ws1.mets_target
    assert str(tmp_path) == ws1.directory
예제 #19
0
 def setUp(self):
     """Set up resolver, bagger and a workspace on a temporary asset copy."""
     # remove an existing backup directory first
     if exists(BACKUPDIR):
         rmtree(BACKUPDIR)
     resolver = Resolver()
     self.resolver = resolver
     self.bagger = WorkspaceBagger(resolver)
     # the asset is copied to '<tempdir>/bag'; its 'data' subfolder is the workspace
     tempdir = mkdtemp()
     self.tempdir = tempdir
     self.bagdir = join(tempdir, 'bag')
     copytree(assets.path_to('kant_aufklaerung_1784'), self.bagdir)
     self.workspace_dir = join(self.bagdir, 'data')
     self.workspace = Workspace(resolver, directory=self.workspace_dir)
예제 #20
0
 def setUp(self):
     """Set up resolver, bagger and a workspace on a temporary asset copy."""
     super().setUp()
     resolver = Resolver()
     self.resolver = resolver
     self.bagger = WorkspaceBagger(resolver)
     # the asset is copied to '<tempdir>/kant_aufklaerung_1784'; its 'data'
     # subfolder is the workspace
     tempdir = mkdtemp()
     self.tempdir = tempdir
     self.bagdir = join(tempdir, 'kant_aufklaerung_1784')
     copytree(assets.path_to('kant_aufklaerung_1784'), self.bagdir)
     self.workspace_dir = join(self.bagdir, 'data')
     workspace_path = join(self.workspace_dir)
     self.workspace = Workspace(resolver, directory=workspace_path)
예제 #21
0
 def test_param_json(self):
     """Run ``KrakenOcr`` through ``run_processor`` and save the METS."""
     resolver = Resolver()
     mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
     workspace = resolver.workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
     processor_options = dict(
         resolver=resolver,
         workspace=workspace,
         input_file_grp="INPUT",
         output_file_grp="OCR-D-OCR-KRAKEN",
     )
     run_processor(KrakenOcr, **processor_options)
     workspace.save_mets()
예제 #22
0
def test_workspace_from_url0():
    """Download the first ``OCR-D-IMG`` file and check its ID and local filename."""

    # act
    workspace = Resolver().workspace_from_url(METS_HEROLD)
    first_image = workspace.mets.find_all_files(fileGrp='OCR-D-IMG')[0]
    downloaded = workspace.download_file(first_image)

    # assert
    tif_name = '%s.tif' % downloaded.ID
    assert tif_name == 'FILE_0001_IMAGE.tif'
    assert downloaded.local_filename == 'OCR-D-IMG/FILE_0001_IMAGE.tif'
예제 #23
0
 def test_run1(self):
     """Run ``KrakenSegment`` with ``level-of-operation`` set to ``line`` and save the METS."""
     resolver = Resolver()
     mets_url = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
     workspace = resolver.workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
     segmenter_options = dict(
         input_file_grp="OCR-D-IMG-BIN",
         output_file_grp="OCR-D-SEG-LINE-KRAKEN",
         parameter={'level-of-operation': 'line'},
     )
     segmenter = KrakenSegment(workspace, **segmenter_options)
     segmenter.process()
     workspace.save_mets()