def test_workspace_from_nothing_noclobber(tmp_path):
    """Attempt to re-create workspace shall fail because already created"""
    ws2 = Resolver().workspace_from_nothing(tmp_path)
    assert ws2.directory == tmp_path
    with pytest.raises(Exception) as exc:
        Resolver().workspace_from_nothing(tmp_path)
    # assert: inspect the raised exception itself via exc.value —
    # str(exc) stringifies the ExceptionInfo wrapper, whose format
    # depends on the pytest version (cf. the correct usage in
    # test_workspace_init_missing_mets)
    the_msg = "METS 'mets.xml' already exists in '%s' and clobber_mets not set" % tmp_path
    assert the_msg in str(exc.value)
def test_validate_ocrd_file(self):
    """Validating the known-faulty glyphs file yields exactly 17 textequiv consistency errors."""
    workspace = Resolver().workspace_from_url(assets.url_of('glyph-consistency/data/mets.xml'))
    with pushd_popd(workspace.directory):
        faulty_file = workspace.mets.find_all_files(ID="FAULTY_GLYPHS_FILE")[0]
        report = PageValidator.validate(ocrd_file=faulty_file)
        consistency_errors = [err for err in report.errors if isinstance(err, ConsistencyError)]
        self.assertEqual(len(consistency_errors), 17, '17 textequiv consistency errors')
def test_workspace_init_missing_mets():
    """Constructing a Workspace over a directory without a METS file must raise."""
    with pytest.raises(Exception) as err:
        Workspace(Resolver(), "foo/bar")
    assert "File does not exist" in str(err.value)
def _fixture_plain_workspace(tmp_path):
    """Yield an empty workspace created in *tmp_path*; cwd is switched there and restored afterwards."""
    workspace = Resolver().workspace_from_nothing(directory=tmp_path)
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    # teardown: return to wherever the test session started
    chdir(original_cwd)
def test_handle_response_for_invalid_content(mock_get, response_dir):
    """If invalid content is returned, store warning log entry"""
    # arrange: mock an HTTP 200 response whose payload is plain text, not METS
    url = 'http://digital.bibliothek.uni-halle.de/hd/oai/?verb=GetRecord&metadataPrefix=mets&mode=xml&identifier=foo'
    mock_get.return_value.status_code = 200
    mock_get.return_value.content = b'foo bar'
    mock_get.return_value.headers = {'Content-Type': 'text/plain'}
    resolver = Resolver()
    initLogging()
    # capture the OAI response handler's log output in an in-memory buffer
    logger = getLogger('ocrd_models.utils.handle_oai_response')
    log_buffer = FIFOIO(256)
    stream_handler = StreamHandler(log_buffer)
    stream_handler.setFormatter(Formatter(LOG_FORMAT))
    logger.addHandler(stream_handler)
    # act
    resolver.download_to_directory(response_dir, url)
    # assert
    mock_get.assert_called_once_with(url)
    assert 'WARNING ocrd_models.utils.handle_oai_response' in log_buffer.getvalue()
def setUp(self):
    """Provide a cache-enabled resolver and a pristine copy of the kant fixture data."""
    self.resolver = Resolver(cache_enabled=True)
    self.folder = os.path.join(TMP_FOLDER, 'kant_aufklaerung_1784')
    # wipe any leftovers from a previous run so every test starts clean
    if os.path.exists(TMP_FOLDER):
        rmtree(TMP_FOLDER)
    os.makedirs(TMP_FOLDER)
    copytree(FOLDER_KANT, self.folder)
def bashlib_input_files(**kwargs):
    """List input files for processing

    Instantiate a processor and workspace from the given processing options.
    Then loop through the input files of the input fileGrp, and for each one,
    print its `url`, `ID`, `mimetype` and `pageId`, as well as its recommended
    `outputFileId` (from ``make_file_id``).

    (The printing format is one associative array initializer per line.)
    """
    initLogging()
    mets = kwargs.pop('mets')
    working_dir = kwargs.pop('working_dir')
    # a local METS reference must actually exist on disk
    if is_local_filename(mets) and not isfile(get_local_filename(mets)):
        raise Exception("File does not exist: %s" % mets)
    workspace = Resolver().workspace_from_url(mets, working_dir)
    processor = Processor(
        workspace,
        ocrd_tool=None,
        page_id=kwargs['page_id'],
        input_file_grp=kwargs['input_file_grp'],
        output_file_grp=kwargs['output_file_grp'])
    attributes = ['url', 'ID', 'mimetype', 'pageId']
    for input_file in processor.input_files:
        # bash-friendly: emit one associative-array initializer per file
        for attribute in attributes:
            print("[%s]='%s'" % (attribute, getattr(input_file, attribute)), end=' ')
        print("[outputFileId]='%s'" % make_file_id(input_file, kwargs['output_file_grp']))
def test_workspace_remove_groups_unforce(workspace_directory):
    """Remove groups by pattern recursive"""
    mets_path = os.path.join(workspace_directory, 'mets.xml')
    # arrange: the fixture METS contains exactly one ALTO fileGrp with two files
    tree_before = ET.parse(mets_path).getroot()
    alto_groups = tree_before.findall(
        './/{http://www.loc.gov/METS/}fileGrp[@USE="OCR-D-GT-ALTO"]')
    assert len(alto_groups) == 1
    assert len(alto_groups[0].findall('.//{http://www.loc.gov/METS/}file')) == 2
    # act: remove all groups matching the pattern, descending into their files
    workspace = Workspace(Resolver(), workspace_directory)
    workspace.remove_file_group('//OCR-D-GT.*', recursive=True)
    workspace.save_mets()
    # assert: the ALTO group is gone from the persisted METS
    tree_after = ET.parse(mets_path).getroot()
    assert tree_after is not None
    assert not tree_after.findall(
        './/{http://www.loc.gov/METS/}fileGrp[@USE="OCR-D-GT-ALTO"]')
def _fixture_workspace_sample_features(tmp_path):
    """Yield a workspace over a copy of the sample-features data; cwd is restored afterwards."""
    # NOTE(review): copytree targets the pytest-created tmp_path directly — verify
    # this works with the shutil version in use (pre-3.8 copytree requires a
    # non-existing destination)
    copytree('tests/data/sample-features', str(tmp_path))
    workspace = Resolver().workspace_from_url(join(str(tmp_path), 'mets.xml'))
    saved_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    chdir(saved_cwd)
def _fixture_workspace_gutachten_data(tmp_path):
    """Yield a workspace over a copy of the gutachten asset data; cwd is restored afterwards."""
    copytree(assets.path_to('gutachten/data'), str(tmp_path))
    workspace = Resolver().workspace_from_url(join(str(tmp_path), 'mets.xml'))
    saved_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    chdir(saved_cwd)
def test_validate_ocrd_file(self):
    """The known-faulty glyphs file produces exactly 17 validation errors."""
    workspace = Resolver().workspace_from_url(
        assets.url_of('glyph-consistency/data/mets.xml'))
    with pushd_popd(workspace.directory):
        faulty_file = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")[0]
        report = PageValidator.validate(ocrd_file=faulty_file)
        self.assertEqual(len(report.errors), 17, 'errors')
def test_validate_ocrd_file(self):
    """The known-faulty glyphs file produces exactly 17 validation errors."""
    workspace = Resolver().workspace_from_url(assets.url_of('glyph-consistency/data/mets.xml'))
    faulty_file = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")[0]
    # ensure the file content is available locally before validating
    if not faulty_file.local_filename:
        workspace.download_file(faulty_file)
    report = PageValidator.validate(ocrd_file=faulty_file)
    self.assertEqual(len(report.errors), 17, 'errors')
def test_resolve_image0():
    """Resolving an image yields the full page; passing a 1x1 coordinate box crops it."""
    workspace = Resolver().workspace_from_url(METS_HEROLD)
    first_image = workspace.mets.find_all_files(fileGrp='OCR-D-IMG')[0]
    full_page = workspace._resolve_image_as_pil(first_image.url)
    assert full_page.size == (2875, 3749)
    cropped = workspace._resolve_image_as_pil(first_image.url, [[0, 0], [1, 1]])
    assert cropped.size == (1, 1)
def test_resolve_image_as_pil_deprecated():
    """Calling the deprecated public resolver API emits exactly one DeprecationWarning."""
    mets_url = os.path.join(assets.url_of('kant_aufklaerung_1784-binarized'), 'data/mets.xml')
    workspace = Resolver().workspace_from_url(mets_url)
    with pytest.warns(DeprecationWarning) as recorded:
        workspace.resolve_image_as_pil('OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017.png')
    # assert
    assert len(recorded) == 1
    assert 'Call to deprecated method resolve_image_as_pil.' in str(recorded[0].message)
def test_run_cli(self):
    """run_cli accepts both the full option set and the bare minimum of arguments."""
    with TemporaryDirectory() as tempdir:
        # full invocation with every supported keyword
        all_options = dict(
            mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
            resolver=Resolver(),
            workspace=None,
            page_id='page1',
            log_level='DEBUG',
            input_file_grp='INPUT',
            output_file_grp='OUTPUT',
            parameter='/path/to/param.json',
            working_dir=tempdir)
        run_cli('echo', **all_options)
        # minimal invocation: only METS URL and resolver
        run_cli(
            'echo',
            mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
            resolver=Resolver(),
        )
def test_workspace_remove_group_not_found(workspace_directory):
    """Group identified by name not found raises exception"""
    resolver = Resolver()
    workspace = Workspace(resolver, workspace_directory)
    with pytest.raises(Exception) as exc:
        workspace.remove_file_group('FOO-BAR')
    # assert on the raised exception via exc.value — str(exc) stringifies the
    # ExceptionInfo wrapper, whose format depends on the pytest version
    assert "No such fileGrp" in str(exc.value)
def _fixture_workspace_kant_aufklaerung(tmp_path):
    """Yield a workspace over a copy of the kant asset data; cwd is restored afterwards."""
    copytree(assets.path_to('kant_aufklaerung_1784/data/'), str(tmp_path))
    workspace = Resolver().workspace_from_url(join(tmp_path, 'mets.xml'), src_baseurl=tmp_path)
    saved_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    chdir(saved_cwd)
def test_workspace_from_url_with_rel_dir(tmp_path):
    """A dst_dir with excess '..' segments is normalized back to the intended absolute path."""
    # climbing far past the filesystem root collapses to '/', then descends into tmp_path
    bogus_dst_dir = '../../../../../../../../../../../../../../../../%s' % str(tmp_path)[1:]
    # act
    with pushd_popd(FOLDER_KANT):
        workspace = Resolver().workspace_from_url('data/mets.xml', dst_dir=bogus_dst_dir)
    # assert
    assert workspace.mets_target == os.path.join(tmp_path, 'mets.xml')
    assert workspace.directory == str(tmp_path)
def setUp(self):
    """Build a bagger over a fresh copy of the kant asset in a temporary bag directory."""
    # clear any backup leftovers from earlier runs
    if exists(BACKUPDIR):
        rmtree(BACKUPDIR)
    self.resolver = Resolver()
    self.bagger = WorkspaceBagger(self.resolver)
    self.tempdir = mkdtemp()
    self.bagdir = join(self.tempdir, 'bag')
    copytree(assets.path_to('kant_aufklaerung_1784'), self.bagdir)
    self.workspace_dir = join(self.bagdir, 'data')
    self.workspace = Workspace(self.resolver, directory=self.workspace_dir)
def setUp(self):
    """Build a bagger over a fresh copy of the kant asset in a temporary directory."""
    super().setUp()
    self.resolver = Resolver()
    self.bagger = WorkspaceBagger(self.resolver)
    self.tempdir = mkdtemp()
    self.bagdir = join(self.tempdir, 'kant_aufklaerung_1784')
    copytree(assets.path_to('kant_aufklaerung_1784'), self.bagdir)
    self.workspace_dir = join(self.bagdir, 'data')
    self.workspace = Workspace(self.resolver, directory=self.workspace_dir)
def test_param_json(self):
    """Run the Kraken OCR processor over the one-file fixture and persist the METS."""
    resolver = Resolver()
    workspace = resolver.workspace_from_url(
        assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'),
        dst_dir=WORKSPACE_DIR)
    run_processor(
        KrakenOcr,
        resolver=resolver,
        workspace=workspace,
        input_file_grp="INPUT",
        output_file_grp="OCR-D-OCR-KRAKEN",
    )
    workspace.save_mets()
def test_workspace_from_url0():
    """Downloading the first image of the OCR-D-IMG group yields the expected local file."""
    # act
    workspace = Resolver().workspace_from_url(METS_HEROLD)
    image_file = workspace.mets.find_all_files(fileGrp='OCR-D-IMG')[0]
    downloaded = workspace.download_file(image_file)
    # assert
    assert '%s.tif' % downloaded.ID == 'FILE_0001_IMAGE.tif'
    assert downloaded.local_filename == 'OCR-D-IMG/FILE_0001_IMAGE.tif'
def test_run1(self):
    """Run Kraken line segmentation over the binarized fixture and persist the METS."""
    resolver = Resolver()
    workspace = resolver.workspace_from_url(
        assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml'),
        dst_dir=WORKSPACE_DIR)
    segmenter = KrakenSegment(
        workspace,
        input_file_grp="OCR-D-IMG-BIN",
        output_file_grp="OCR-D-SEG-LINE-KRAKEN",
        parameter={'level-of-operation': 'line'},
    )
    segmenter.process()
    workspace.save_mets()