def _fixture_plain_workspace(tmp_path):
    """Yield a fresh empty workspace in ``tmp_path`` with the CWD switched into it.

    Generator fixture: creates an empty workspace via ``Resolver``, changes the
    current working directory to ``tmp_path`` for the duration of the test, and
    restores the previous working directory on teardown.

    :param tmp_path: directory to create the workspace in (pytest ``tmp_path``)
    :yields: the freshly created workspace
    """
    resolver = Resolver()
    ws = resolver.workspace_from_nothing(directory=tmp_path)
    prev_dir = abspath(curdir)
    chdir(tmp_path)
    try:
        yield ws
    finally:
        # Restore the CWD even if the test body raised (the generator is then
        # closed with an exception), so later tests are not left in tmp_path.
        chdir(prev_dir)
class TestWorkspace(TestCase):
    """Tests for workspace file handling: add_file, download_url/download_file,
    remove_file(_group), save_image_file, and the str() representation."""

    def setUp(self):
        # fresh resolver per test
        self.resolver = Resolver()

    def test_workspace_add_file(self):
        """add_file with string content writes the file and registers it in METS."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            fpath = join(tempdir, 'ID1.tif')
            ws1.add_file('GRP', ID='ID1', mimetype='image/tiff', content='CONTENT', local_filename=fpath)
            f = ws1.mets.find_files()[0]
            self.assertEqual(f.ID, 'ID1')
            self.assertEqual(f.mimetype, 'image/tiff')
            self.assertEqual(f.url, fpath)
            self.assertEqual(f.local_filename, fpath)
            self.assertTrue(exists(fpath))

    def test_workspace_add_file_basename_no_content(self):
        """add_file without content/local_filename leaves the URL unset (None)."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.add_file('GRP', ID='ID1', mimetype='image/tiff')
            f = ws1.mets.find_files()[0]
            self.assertEqual(f.url, None)

    def test_workspace_add_file_binary_content(self):
        """add_file with bytes content creates intermediate directories as needed."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            # note: 'subdir' does not exist beforehand
            fpath = join(tempdir, 'subdir', 'ID1.tif')
            ws1.add_file('GRP', ID='ID1', content=b'CONTENT', local_filename=fpath, url='http://foo/bar')
            self.assertTrue(exists(fpath))

    def test_workspacec_add_file_content_wo_local_filename(self):
        """add_file must reject content without a local_filename to write it to."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            with self.assertRaisesRegex(Exception, "'content' was set but no 'local_filename'"):
                ws1.add_file('GRP', ID='ID1', content=b'CONTENT')

    def test_workspace_str(self):
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(str(ws1), 'Workspace[directory=%s, baseurl=None, file_groups=[], files=[]]' % tempdir)

    def test_workspace_backup(self):
        """Same as test_workspace_str but with automatic_backup enabled."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.automatic_backup = True
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(str(ws1), 'Workspace[directory=%s, baseurl=None, file_groups=[], files=[]]' % tempdir)

    def test_download_url0(self):
        """Downloading a local absolute path stores the file under 'TEMP'."""
        with TemporaryDirectory() as directory:
            ws1 = self.resolver.workspace_from_nothing(directory)
            fn = ws1.download_url(abspath(__file__))
            self.assertEqual(fn, join('TEMP', basename(__file__)))

    def test_download_url_without_baseurl(self):
        with TemporaryDirectory() as tempdir:
            dst_mets = join(tempdir, 'mets.xml')
            copyfile(SRC_METS, dst_mets)
            ws1 = self.resolver.workspace_from_url(dst_mets)
            # without src_baseurl, a relative sample URL cannot be resolved
            with self.assertRaisesRegex(Exception, "Already tried prepending baseurl '%s'" % tempdir):
                ws1.download_url(SAMPLE_FILE_URL)

    def test_download_url_with_baseurl(self):
        with TemporaryDirectory() as tempdir:
            dst_mets = join(tempdir, 'mets.xml')
            copyfile(SRC_METS, dst_mets)
            ws1 = self.resolver.workspace_from_url(dst_mets, src_baseurl=dirname(SRC_METS))
            f = Path(ws1.download_url(SAMPLE_FILE_URL))
            self.assertEqual(f, Path('TEMP', '%s.tif' % SAMPLE_FILE_ID))
            self.assertTrue(Path(ws1.directory, f).exists())

    def test_from_url_dst_dir_download(self):
        """
        https://github.com/OCR-D/core/issues/319
        """
        with TemporaryDirectory() as tempdir:
            ws_dir = join(tempdir, 'non-existing-for-good-measure')
            # Create a relative path to trigger #319
            src_path = str(Path(assets.path_to('kant_aufklaerung_1784/data/mets.xml')).relative_to(Path.cwd()))
            self.resolver.workspace_from_url(src_path, dst_dir=ws_dir, download=True)
            self.assertTrue(Path(ws_dir, 'mets.xml').exists())  # sanity check, mets.xml must exist
            self.assertTrue(Path(ws_dir, 'OCR-D-GT-PAGE/PAGE_0017_PAGE.xml').exists())

    def test_superfluous_copies_in_ws_dir(self):
        """
        https://github.com/OCR-D/core/issues/227
        """
        def find_recursive(root):
            # collect all file names (not directories) below root
            ret = []
            for _, _, f in walk(root):
                for file in f:
                    ret.append(file)
            return ret
        with TemporaryDirectory() as wsdir:
            with open(assets.path_to('SBB0000F29300010000/data/mets_one_file.xml'), 'r') as f_in:
                with open(join(wsdir, 'mets.xml'), 'w') as f_out:
                    f_out.write(f_in.read())
            self.assertEqual(len(find_recursive(wsdir)), 1)
            ws1 = Workspace(self.resolver, wsdir)
            for file in ws1.mets.find_files():
                ws1.download_file(file)
            # exactly one file downloaded, no superfluous copies
            self.assertEqual(len(find_recursive(wsdir)), 2)
            self.assertTrue(exists(join(wsdir, 'OCR-D-IMG/FILE_0005_IMAGE.tif')))

    def test_remove_file_force(self):
        with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
            workspace = Workspace(self.resolver, directory=tempdir)
            with self.assertRaisesRegex(FileNotFoundError, "not found"):
                # should fail
                workspace.remove_file('non-existing-id')
            # should succeed
            workspace.remove_file('non-existing-id', force=True)

    def test_remove_file_remote(self):
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('IMG', ID='page1_img', mimetype='image/tiff', url='http://remote')
            with self.assertRaisesRegex(Exception, "not locally available"):
                # should fail
                ws.remove_file('page1_img')
            # should succeed
            ws.remove_file('page1_img', force=True)

    def test_remove_file_group_force(self):
        with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
            workspace = Workspace(self.resolver, directory=tempdir)
            with self.assertRaisesRegex(Exception, "No such fileGrp"):
                # raise error unless force
                workspace.remove_file_group('I DO NOT EXIST')
            # no error
            workspace.remove_file_group('I DO NOT EXIST', force=True)

    def test_remove_file_group_rmdir(self):
        with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
            workspace = Workspace(self.resolver, directory=tempdir)
            self.assertTrue(exists(join(tempdir, 'OCR-D-IMG')))
            workspace.remove_file_group('OCR-D-IMG', recursive=True)
            # recursive removal also deletes the on-disk directory
            self.assertFalse(exists(join(tempdir, 'OCR-D-IMG')))

    def test_download_to_directory_from_workspace_download_file(self):
        """
        https://github.com/OCR-D/core/issues/342
        """
        # tempdir = mkdtemp()
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            f1 = ws1.add_file('IMG', ID='page1_img', mimetype='image/tiff', local_filename='test.tif', content='')
            f2 = ws1.add_file('GT', ID='page1_gt', mimetype='text/xml', local_filename='test.xml', content='')
            self.assertEqual(f1.url, 'test.tif')
            self.assertEqual(f2.url, 'test.xml')
            # these should be no-ops
            ws1.download_file(f1)
            ws1.download_file(f2)
            self.assertEqual(f1.url, 'test.tif')
            self.assertEqual(f2.url, 'test.xml')

    def test_save_image_file(self):
        from PIL import Image
        img = Image.new('RGB', (1000, 1000))
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            # invalid mimetype must raise (no known file extension)
            with self.assertRaisesRegex(KeyError, ''):
                ws.save_image_file(img, 'page1_img', 'IMG', 'page1', 'ceci/nest/pas/une/mimetype')
            ws.save_image_file(img, 'page1_img', 'IMG', 'page1', 'image/jpeg')
            self.assertTrue(exists(join(tempdir, 'IMG', 'page1_img.jpg')))
def convert(cocofile, directory):
    """Convert MS-COCO JSON to METS/PAGE XML files.

    Load JSON ``cocofile`` (in MS-COCO format) and chdir to ``directory``
    (which it refers to). Start a METS file mets.xml with references to
    the image files (under fileGrp ``OCR-D-IMG``) and their corresponding
    PAGE-XML annotations (under fileGrp ``OCR-D-GT-SEG-BLOCK``), as parsed
    from ``cocofile`` and written using the same basename.
    """
    resolver = Resolver()
    with pushd_popd(directory):
        workspace = resolver.workspace_from_nothing('.')
        # https://github.com/ibm-aur-nlp/PubLayNet
        workspace.mets.unique_identifier = 'ocrd_PubLayNet_' + directory
        coco = json.load(cocofile)
        LOG.info('Loaded JSON for %d images with %d regions in %d categories',
                 len(coco['images']), len(coco['annotations']), len(coco['categories']))
        # index categories by their numeric COCO id
        categories = dict()
        for cat in coco['categories']:
            categories[cat['id']] = cat['name']
        # index images by their numeric COCO id
        images = dict()
        for image in coco['images']:
            images[image['id']] = image
        # attach each annotation to its image as a 'regions' list
        for annotation in coco['annotations']:
            image = images[annotation['image_id']]
            regions = image.setdefault('regions', list())
            regions.append(annotation)
        # free the (potentially large) raw JSON structure early
        del coco
        LOG.info('Parsing annotations into PAGE-XML')
        for image in images.values():
            page_id = 'p' + str(image['id'])
            file_base, file_ext = os.path.splitext(image['file_name'])
            # PAGE-XML file shares the image's basename
            filename = file_base + '.xml'
            image_file = workspace.add_file('OCR-D-IMG',
                                            ID='OCR-D-IMG_' + page_id,
                                            pageId=page_id,
                                            mimetype=EXT_TO_MIME[file_ext],
                                            local_filename=image['file_name'])
            LOG.info('Added page %s file %s of type %s',
                     image_file.pageId, image_file.local_filename, image_file.mimetype)
            pcgts = page_from_image(image_file)
            pcgts.set_pcGtsId(page_id)
            page = pcgts.get_Page()
            # image dimensions in the annotation must match the actual file
            # NOTE(review): plain asserts are stripped under `python -O`
            assert page.imageWidth == image['width']
            assert page.imageHeight == image['height']
            for region in image['regions']:
                # COCO 'segmentation' is a singly-nested flat [x1,y1,x2,y2,...]
                # list; reshape it into (N, 2) point pairs
                polygon = np.array(region['segmentation'])
                polygon = np.reshape(polygon, (polygon.shape[1] // 2, 2))
                coords = CoordsType(points=points_from_polygon(polygon))
                category = categories[region['category_id']]
                region_id = 'r' + str(region['id'])
                # map COCO categories to PAGE region (sub)types
                if category == 'text':
                    region_obj = TextRegionType(id=region_id, Coords=coords,
                                                type_=TextTypeSimpleType.PARAGRAPH)
                    page.add_TextRegion(region_obj)
                elif category == 'title':
                    region_obj = TextRegionType(id=region_id, Coords=coords,
                                                type_=TextTypeSimpleType.HEADING)  # CAPTION?
                    page.add_TextRegion(region_obj)
                elif category == 'list':
                    region_obj = TextRegionType(id=region_id, Coords=coords,
                                                type_=TextTypeSimpleType.LISTLABEL)  # OTHER?
                    page.add_TextRegion(region_obj)
                elif category == 'table':
                    region_obj = TableRegionType(id=region_id, Coords=coords)
                    page.add_TableRegion(region_obj)
                elif category == 'figure':
                    region_obj = ImageRegionType(id=region_id, Coords=coords)
                    page.add_ImageRegion(region_obj)
                else:
                    raise Exception('unknown image category: %s' % category)
            page_file = workspace.add_file('OCR-D-GT-SEG-BLOCK',
                                           ID='OCR-D-GT-SEG-BLOCK_' + page_id,
                                           pageId=page_id,
                                           mimetype=MIMETYPE_PAGE,
                                           local_filename=filename,
                                           content=to_xml(pcgts))
            LOG.info('Added page %s file %s with %d regions',
                     page_file.pageId, page_file.local_filename, len(image['regions']))
        LOG.info('All done')
        workspace.save_mets()
class TestResolver(TestCase):
    """Tests for Resolver: workspace_from_url/from_nothing and
    download_to_directory, using a per-test copy of the Kant fixture."""

    def setUp(self):
        self.resolver = Resolver()
        self.folder = join(TMP_FOLDER, 'kant_aufklaerung_1784')
        # start from a clean fixture copy for every test
        if exists(TMP_FOLDER):
            rmtree(TMP_FOLDER)
        os.makedirs(TMP_FOLDER)
        copytree(FOLDER_KANT, self.folder)

    def test_workspace_from_url_bad(self):
        with self.assertRaisesRegex(Exception, "Must pass mets_url and/or baseurl"):
            self.resolver.workspace_from_url(None)

    def test_workspace_from_url_tempdir(self):
        self.resolver.workspace_from_url(
            mets_basename='foo.xml',
            mets_url='https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml')

    def test_workspace_from_url_download(self):
        with TemporaryDirectory() as dst_dir:
            self.resolver.workspace_from_url(
                mets_basename='foo.xml',
                dst_dir=dst_dir,
                download=True,
                mets_url='https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml')

    def test_workspace_from_url_no_clobber(self):
        with self.assertRaisesRegex(Exception, "already exists but clobber_mets is false"):
            with TemporaryDirectory() as dst_dir:
                with open(join(dst_dir, 'mets.xml'), 'w') as f:
                    f.write('CONTENT')
                self.resolver.workspace_from_url(
                    dst_dir=dst_dir,
                    mets_url='https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml')

    def test_workspace_from_url_404(self):
        with self.assertRaisesRegex(Exception, "Not found"):
            self.resolver.workspace_from_url(
                mets_url='https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xmlX')

    def test_workspace_from_url_rel_dir(self):
        with TemporaryDirectory() as dst_dir:
            # FIX: capture the current directory locally instead of relying on
            # an undefined/module-level `oldpwd`, and restore it in `finally`
            # so a failing call cannot leave the CWD changed for later tests.
            oldpwd = os.getcwd()
            os.chdir(FOLDER_KANT)
            try:
                self.resolver.workspace_from_url(
                    None,
                    baseurl='data',
                    dst_dir='../../../../../../../../../../../../../../../../' + dst_dir[1:])
            finally:
                os.chdir(oldpwd)

    def test_workspace_from_url(self):
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        # print(METS_HEROLD)
        # print(workspace.mets)
        input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        # print [str(f) for f in input_files]
        image_file = input_files[0]
        # print(image_file)
        f = workspace.download_file(image_file)
        self.assertEqual(f.ID, 'FILE_0001_IMAGE')
        # print(f)

    def test_resolve_image(self):
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        f = input_files[0]
        print(f.url)
        img_pil1 = workspace.resolve_image_as_pil(f.url)
        self.assertEqual(img_pil1.size, (2875, 3749))
        # second arg is a coordinate pair: crop to a 1x1 region
        img_pil2 = workspace.resolve_image_as_pil(f.url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil2.size, (1, 1))
        img_pil2 = workspace.resolve_image_as_pil(f.url, [[0, 0], [1, 1]])

    def test_resolve_image_grayscale(self):
        img_url = assets.url_of('kant_aufklaerung_1784-binarized/data/OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017')
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        img_pil1 = workspace.resolve_image_as_pil(img_url)
        self.assertEqual(img_pil1.size, (1457, 2083))
        img_pil2 = workspace.resolve_image_as_pil(img_url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil2.size, (1, 1))

    def test_resolve_image_bitonal(self):
        img_url = assets.url_of('kant_aufklaerung_1784-binarized/data/OCR-D-IMG-1BIT/OCR-D-IMG-1BIT_0017')
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        img_pil1 = workspace.resolve_image_as_pil(img_url)
        self.assertEqual(img_pil1.size, (1457, 2083))
        img_pil2 = workspace.resolve_image_as_pil(img_url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil2.size, (1, 1))

    def test_workspace_from_nothing(self):
        ws1 = self.resolver.workspace_from_nothing(None)
        self.assertIsNotNone(ws1.mets)
        tmp_dir = join(TMP_FOLDER, 'from-nothing')
        ws2 = self.resolver.workspace_from_nothing(tmp_dir)
        self.assertEqual(ws2.directory, tmp_dir)
        # creating a second workspace in the same dir must fail
        try:
            ws2 = self.resolver.workspace_from_nothing(tmp_dir)
            self.assertTrue(False, "expecting to fail")
        except Exception as e:
            self.assertTrue('Not clobbering' in str(e))

    def test_download_to_directory_badargs_url(self):
        with self.assertRaisesRegex(Exception, "'url' must be a string"):
            self.resolver.download_to_directory(None, None)

    def test_download_to_directory_badargs_directory(self):
        with self.assertRaisesRegex(Exception, "'directory' must be a string"):
            self.resolver.download_to_directory(None, 'foo')

    def test_download_to_directory_default(self):
        tmp_dir = join(TMP_FOLDER, 'target')
        fn = self.resolver.download_to_directory(tmp_dir, 'file://' + join(self.folder, 'data/mets.xml'))
        # default basename is derived from the URL with separators dotted out
        self.assertEqual(fn, join(tmp_dir, 'file%s.data.mets.xml' % sub(r'[/_\.\-]', '.', self.folder)))

    def test_download_to_directory_basename(self):
        tmp_dir = join(TMP_FOLDER, 'target')
        fn = self.resolver.download_to_directory(tmp_dir, 'file://' + join(self.folder, 'data/mets.xml'), basename='foo')
        self.assertEqual(fn, join(tmp_dir, 'foo'))

    def test_download_to_directory_subdir(self):
        tmp_dir = join(TMP_FOLDER, 'target')
        fn = self.resolver.download_to_directory(tmp_dir, 'file://' + join(self.folder, 'data/mets.xml'), subdir='baz')
        self.assertEqual(fn, join(tmp_dir, 'baz', 'mets.xml'))

    def test_workspace_add_file(self):
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            fpath = join(tempdir, 'ID1.tif')
            ws1.add_file('GRP', ID='ID1', mimetype='image/tiff', content='CONTENT', local_filename=fpath)
            f = ws1.mets.find_files()[0]
            self.assertEqual(f.ID, 'ID1')
            self.assertEqual(f.mimetype, 'image/tiff')
            self.assertEqual(f.url, fpath)
            self.assertEqual(f.local_filename, fpath)
            self.assertTrue(exists(fpath))

    def test_workspace_add_file_basename_no_content(self):
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.add_file('GRP', ID='ID1', mimetype='image/tiff')
            f = ws1.mets.find_files()[0]
            self.assertEqual(f.url, '')

    def test_workspace_add_file_binary_content(self):
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            fpath = join(tempdir, 'subdir', 'ID1.tif')
            ws1.add_file('GRP', ID='ID1', content=b'CONTENT', local_filename=fpath, url='http://foo/bar')
            self.assertTrue(exists(fpath))

    def test_workspacec_add_file_content_wo_local_filename(self):
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            with self.assertRaisesRegex(Exception, "'content' was set but no 'local_filename'"):
                ws1.add_file('GRP', ID='ID1', content=b'CONTENT')

    def test_workspace_str(self):
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(str(ws1), 'Workspace[directory=%s, file_groups=[], files=[]]' % tempdir)

    def test_workspace_backup(self):
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.automatic_backup = True
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(str(ws1), 'Workspace[directory=%s, file_groups=[], files=[]]' % tempdir)

    def test_227_1(self):
        """Regression test for https://github.com/OCR-D/core/issues/227."""
        def find_recursive(root):
            # collect all file names (not directories) below root
            ret = []
            for _, _, f in os.walk(root):
                for file in f:
                    ret.append(file)
            return ret
        with TemporaryDirectory() as wsdir:
            with open(assets.path_to('SBB0000F29300010000/data/mets_one_file.xml'), 'r') as f_in:
                with open(join(wsdir, 'mets.xml'), 'w') as f_out:
                    f_out.write(f_in.read())
            self.assertEqual(len(find_recursive(wsdir)), 1)
            ws1 = Workspace(self.resolver, wsdir)
            for file in ws1.mets.find_files():
                ws1.download_file(file)
            self.assertEqual(len(find_recursive(wsdir)), 2)
            self.assertTrue(exists(join(wsdir, 'OCR-D-IMG/FILE_0005_IMAGE')))
class TestCli(TestCase):
    """Tests for the `ocrd workspace` command-line interface via CliRunner."""

    def setUp(self):
        self.maxDiff = None
        self.resolver = Resolver()
        initLogging()
        self.runner = CliRunner()

    def test_add(self):
        """
        Ensure that `ocrd workspace add` does the right thing
        """
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        local_filename = join(file_grp, 'foo.xml')
        # mets_api = None
        # mets_cli = None
        with TemporaryDirectory() as tempdir:
            # reference run through the API
            ws_api = self.resolver.workspace_from_nothing(directory=tempdir)
            ws_api.add_file(file_grp, ID=ID, content=content, pageId=page_id, mimetype=mimetype, local_filename=local_filename)
            ws_api.save_mets()
            # mets_api = ws_api.mets.to_xml().decode('utf8')
        with TemporaryDirectory() as tempdir:
            # same run through the CLI
            ws_api = self.resolver.workspace_from_nothing(directory=tempdir)
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)
            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir, 'add',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', ID,
                '--mimetype', mimetype,
                content_file])
            self.assertEqual(result.exit_code, 0)
            # TODO too complex to compare :(
            # with open(join(tempdir, 'mets.xml')) as f:
            #     mets_cli = f.read()
            # print(mets_api)
            # print(mets_cli)
            # self.assertEqual(mets_api, mets_cli)
            # print(result.output)
            # with open(join(tempdir, 'mets.xml')) as f:
            #     print(f.read())
            self.assertEqual(result.exit_code, 0)

    def test_add_remove(self):
        """`workspace remove` without --force keeps the file on disk."""
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)
            result = self.runner.invoke(workspace_cli, ['init', tempdir])
            self.assertEqual(result.exit_code, 0)
            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir, 'add',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', ID,
                '--mimetype', mimetype,
                content_file])
            self.assertEqual(result.exit_code, 0)
            result = self.runner.invoke(workspace_cli, ['-d', tempdir, 'remove', ID])
            self.assertEqual(result.exit_code, 0)
            # File should still exist
            self.assertTrue(exists(content_file))

    def test_add_remove_force(self):
        """`workspace remove --force` also deletes the file on disk."""
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)
            result = self.runner.invoke(workspace_cli, ['init', tempdir])
            self.assertEqual(result.exit_code, 0)
            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir, 'add',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', ID,
                '--mimetype', mimetype,
                content_file])
            self.assertEqual(result.exit_code, 0)
            result = self.runner.invoke(workspace_cli, ['-d', tempdir, 'remove', '--force', ID])
            print(result)
            print(result.output)
            self.assertEqual(result.exit_code, 0)
            # File should have been deleted
            self.assertFalse(exists(content_file))

    def test_find_files(self):
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)
            with pushd_popd(wsdir):
                result = self.runner.invoke(workspace_cli, ['find', '-G', 'OCR-D-IMG-BIN', '-k', 'fileGrp'])
                self.assertEqual(result.output, 'OCR-D-IMG-BIN\nOCR-D-IMG-BIN\n')
                self.assertEqual(result.exit_code, 0)

    def test_prune_files(self):
        with TemporaryDirectory() as tempdir:
            copytree(assets.path_to('SBB0000F29300010000/data'), join(tempdir, 'ws'))
            ws1 = self.resolver.workspace_from_url(join(tempdir, 'ws', 'mets.xml'))
            self.assertEqual(len(ws1.mets.find_files()), 35)
            result = self.runner.invoke(workspace_cli, ['-d', join(tempdir, 'ws'), 'prune-files'])
            self.assertEqual(result.exit_code, 0)
            # only locally available files remain after pruning
            ws2 = self.resolver.workspace_from_url(join(tempdir, 'ws', 'mets.xml'))
            self.assertEqual(len(ws2.mets.find_files()), 7)

    def test_remove_file_group(self):
        """
        Test removal of filegrp
        """
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)
            file_group = 'OCR-D-GT-PAGE'
            file_path = join(tempdir, 'ws', file_group, 'FILE_0002_FULLTEXT.xml')
            self.assertTrue(exists(file_path))
            workspace = self.resolver.workspace_from_url(join(wsdir, 'mets.xml'))
            self.assertEqual(workspace.directory, wsdir)
            with self.assertRaisesRegex(Exception, "not empty"):
                workspace.remove_file_group(file_group)
            with self.assertRaisesRegex(Exception, "force without recursive"):
                workspace.remove_file_group(file_group, force=True)
            self.assertTrue(exists(file_path))
            self.assertEqual(len(workspace.mets.file_groups), 17)
            self.assertEqual(len(workspace.mets.find_files()), 35)
            workspace.remove_file_group(file_group, recursive=True, force=True)
            self.assertEqual(len(workspace.mets.file_groups), 16)
            self.assertEqual(len(workspace.mets.find_files()), 33)
            self.assertFalse(exists(file_path))

    def test_copy_vs_clone(self):
        src_dir = assets.path_to('kant_aufklaerung_1784/data')
        with TemporaryDirectory() as tempdir:
            # cloned without download
            shallowcloneddir = join(tempdir, 'cloned-shallow')
            # cloned with download
            fullcloneddir = join(tempdir, 'cloned-all')
            # copied
            copieddir = join(tempdir, 'copied')
            Path(fullcloneddir).mkdir()
            Path(shallowcloneddir).mkdir()
            result = self.runner.invoke(workspace_cli, ['clone', join(src_dir, 'mets.xml'), shallowcloneddir])
            self.assertEqual(result.exit_code, 0)
            result = self.runner.invoke(workspace_cli, ['clone', '-a', join(src_dir, 'mets.xml'), fullcloneddir])
            self.assertEqual(result.exit_code, 0)
            with copy_of_directory(src_dir, copieddir):
                # shallow clone lacks the data fileGrp directories
                shallow_vs_copied = dircmp(shallowcloneddir, copieddir)
                self.assertEqual(set(shallow_vs_copied.right_only), set(['OCR-D-GT-ALTO', 'OCR-D-GT-PAGE', 'OCR-D-IMG']))
                # full clone has the same directory contents as a copy
                full_vs_copied = dircmp(fullcloneddir, copieddir)
                # print(full_vs_copied)
                # from ocrd_utils import pushd_popd
                # with pushd_popd(tempdir):
                #     import os
                #     os.system("diff %s/mets.xml %s/mets.xml" % (fullcloneddir, copieddir))
                # XXX mets.xml will not have the exact same content because
                # URLs that are actually files will be marked up as such with
                # @LOCTYPE/@OTHERLOCTYPE
                # self.assertEqual(full_vs_copied.diff_files, [])
                self.assertEqual(full_vs_copied.left_only, [])
                self.assertEqual(full_vs_copied.right_only, [])
class TestResolver(TestCase):
    """Tests for Resolver against the newer API (relative local_filename,
    'HTTP request failed' errors, pushd_popd-based CWD handling)."""

    def setUp(self):
        self.resolver = Resolver()

    def test_workspace_from_url_bad(self):
        with self.assertRaisesRegex(Exception, "Must pass 'mets_url'"):
            self.resolver.workspace_from_url(None)

    def test_workspace_from_url_tempdir(self):
        self.resolver.workspace_from_url(
            mets_basename='foo.xml',
            mets_url='https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml')

    def test_workspace_from_url_download(self):
        with TemporaryDirectory() as dst_dir:
            self.resolver.workspace_from_url(
                'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml',
                mets_basename='foo.xml',
                dst_dir=dst_dir,
                download=True)

    def test_workspace_from_url_no_clobber(self):
        # NOTE(review): despite the name, no exception is asserted here --
        # presumably clobber_mets=False only warns in this API version; verify.
        with TemporaryDirectory() as dst_dir:
            src_mets = Path(assets.path_to('kant_aufklaerung_1784-binarized/data/mets.xml'))
            dst_mets = Path(dst_dir, 'mets.xml')
            dst_mets.write_text(src_mets.read_text())
            self.resolver.workspace_from_url(
                'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml',
                clobber_mets=False,
                dst_dir=dst_dir)

    def test_workspace_from_url_404(self):
        with self.assertRaisesRegex(Exception, "HTTP request failed"):
            self.resolver.workspace_from_url(mets_url='https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xmlX')

    def test_workspace_from_url_rel_dir(self):
        with TemporaryDirectory() as dst_dir:
            # deliberately convoluted relative path that resolves back to dst_dir
            bogus_dst_dir = '../../../../../../../../../../../../../../../../%s' % dst_dir[1:]
            with pushd_popd(FOLDER_KANT):
                ws1 = self.resolver.workspace_from_url('data/mets.xml', dst_dir=bogus_dst_dir)
                self.assertEqual(ws1.mets_target, pjoin(dst_dir, 'mets.xml'))
                self.assertEqual(ws1.directory, dst_dir)

    def test_workspace_from_url0(self):
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        # print(workspace.mets)
        input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        # print [str(f) for f in input_files]
        image_file = input_files[0]
        # print(image_file)
        f = workspace.download_file(image_file)
        self.assertEqual('%s.tif' % f.ID, 'FILE_0001_IMAGE.tif')
        # local_filename is relative to the workspace directory
        self.assertEqual(f.local_filename, 'OCR-D-IMG/FILE_0001_IMAGE.tif')
        # print(f)

    # pylint: disable=protected-access
    def test_resolve_image0(self):
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        f = input_files[0]
        print(f.url)
        img_pil1 = workspace._resolve_image_as_pil(f.url)
        print(f.url)
        self.assertEqual(img_pil1.size, (2875, 3749))
        # second arg is a coordinate pair: crop to a 1x1 region
        img_pil2 = workspace._resolve_image_as_pil(f.url, [[0, 0], [1, 1]])
        print(f.url)
        self.assertEqual(img_pil2.size, (1, 1))
        img_pil2 = workspace._resolve_image_as_pil(f.url, [[0, 0], [1, 1]])

    # pylint: disable=protected-access
    def test_resolve_image_grayscale(self):
        img_url = assets.url_of('kant_aufklaerung_1784-binarized/data/OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017.png')
        workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))
        img_pil1 = workspace.resolve_image_as_pil(img_url)
        self.assertEqual(img_pil1.size, (1457, 2083))
        img_pil2 = workspace._resolve_image_as_pil(img_url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil2.size, (1, 1))

    # pylint: disable=protected-access
    def test_resolve_image_bitonal(self):
        img_url = assets.url_of('kant_aufklaerung_1784-binarized/data/OCR-D-IMG-1BIT/OCR-D-IMG-1BIT_0017.png')
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        img_pil1 = workspace._resolve_image_as_pil(img_url)
        self.assertEqual(img_pil1.size, (1457, 2083))
        img_pil2 = workspace._resolve_image_as_pil(img_url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil2.size, (1, 1))

    def test_workspace_from_nothing(self):
        ws1 = self.resolver.workspace_from_nothing(None)
        self.assertIsNotNone(ws1.mets)

    def test_workspace_from_nothing_makedirs(self):
        with TemporaryDirectory() as tempdir:
            non_existant_dir = Path(tempdir, 'target')
            ws1 = self.resolver.workspace_from_nothing(non_existant_dir)
            self.assertEqual(ws1.directory, non_existant_dir)

    def test_workspace_from_nothing_noclobber(self):
        with TemporaryDirectory() as tempdir:
            ws2 = self.resolver.workspace_from_nothing(tempdir)
            self.assertEqual(ws2.directory, tempdir)
            with self.assertRaisesRegex(Exception, "METS 'mets.xml' already exists in '%s' and clobber_mets not set" % tempdir):
                # must fail because tempdir was just created
                self.resolver.workspace_from_nothing(tempdir)

    def test_download_to_directory_badargs_url(self):
        with self.assertRaisesRegex(Exception, "'url' must be a string"):
            self.resolver.download_to_directory(None, None)

    def test_download_to_directory_badargs_directory(self):
        with self.assertRaisesRegex(Exception, "'directory' must be a string"):
            self.resolver.download_to_directory(None, 'foo')

    def test_download_to_directory_default(self):
        with copy_of_directory(FOLDER_KANT) as src:
            with TemporaryDirectory() as dst:
                fn = self.resolver.download_to_directory(dst, pjoin(src, 'data/mets.xml'))
                # returned path is relative to dst in this API version
                self.assertEqual(fn, 'mets.xml')
                self.assertTrue(Path(dst, fn).exists())

    def test_download_to_directory_basename(self):
        with copy_of_directory(FOLDER_KANT) as src:
            with TemporaryDirectory() as dst:
                fn = self.resolver.download_to_directory(dst, pjoin(src, 'data/mets.xml'), basename='foo')
                self.assertEqual(fn, 'foo')
                self.assertTrue(Path(dst, fn).exists())

    def test_download_to_directory_subdir(self):
        with copy_of_directory(FOLDER_KANT) as src:
            with TemporaryDirectory() as dst:
                fn = self.resolver.download_to_directory(dst, pjoin(src, 'data/mets.xml'), subdir='baz')
                self.assertEqual(fn, pjoin('baz', 'mets.xml'))
                self.assertTrue(Path(dst, fn).exists())
class TestWorkspaceValidator(TestCase):
    """Tests for ocrd.validator.WorkspaceValidator against the bundled assets."""

    def setUp(self):
        super().setUp()
        self.resolver = Resolver()

    def test_check_file_grp_basic(self):
        """check_file_grp flags missing input fileGrps and pre-existing output fileGrps."""
        workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))
        report = WorkspaceValidator.check_file_grp(workspace, 'foo', 'bar')
        self.assertFalse(report.is_valid)
        self.assertEqual(len(report.errors), 1)
        self.assertEqual(report.errors[0], "Input fileGrp[@USE='foo'] not in METS!")
        report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN')
        self.assertFalse(report.is_valid)
        self.assertEqual(len(report.errors), 1)
        self.assertEqual(report.errors[0], "Output fileGrp[@USE='OCR-D-IMG-BIN'] already in METS!")
        report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG,FOO', 'FOO')
        self.assertFalse(report.is_valid)
        self.assertEqual(len(report.errors), 1)
        self.assertEqual(report.errors[0], "Input fileGrp[@USE='FOO'] not in METS!")
        report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG,FOO', None)
        self.assertFalse(report.is_valid)
        self.assertEqual(len(report.errors), 1)
        self.assertEqual(report.errors[0], "Input fileGrp[@USE='FOO'] not in METS!")
        report = WorkspaceValidator.check_file_grp(workspace, None, '')
        self.assertTrue(report.is_valid)

    def test_check_file_grp_page_id_str(self):
        """A comma-separated page_id string restricts the output-fileGrp collision check."""
        workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))
        report = WorkspaceValidator.check_file_grp(
            workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0003,PHYS_0001')
        self.assertFalse(report.is_valid)
        self.assertEqual(len(report.errors), 1)
        self.assertEqual(report.errors[0], "Output fileGrp[@USE='OCR-D-IMG-BIN'] already contains output for page PHYS_0001")

    def test_check_file_grp_page_id_list(self):
        """page_id may also be given as a list."""
        workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))
        report = WorkspaceValidator.check_file_grp(
            workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id=['PHYS_0003', 'PHYS_0001'])
        self.assertFalse(report.is_valid)
        self.assertEqual(len(report.errors), 1)

    def test_check_file_grp_page_id_valid(self):
        """No collision for a page that has no output yet."""
        workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))
        report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0004')
        self.assertTrue(report.is_valid)

    def test_simple(self):
        """A known-good single-file workspace validates cleanly."""
        report = WorkspaceValidator.validate(
            self.resolver,
            assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'),
            download=True)
        self.assertTrue(report.is_valid)

    def test_validate_twice(self):
        """Running the validator twice on the same instance must stay valid."""
        validator = WorkspaceValidator(
            self.resolver,
            assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'),
            download=True)
        report = validator._validate()  # pylint: disable=protected-access
        report = validator._validate()  # pylint: disable=protected-access
        print(report.errors)
        self.assertTrue(report.is_valid)

    def test_validate_empty(self):
        """An empty workspace reports missing identifier, missing files and missing fileGrp."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 3)  # no-files, missing id, missing fileGrp
            self.assertIn('no unique identifier', report.errors[0])
            self.assertIn('No files', report.errors[1])
            # adding an identifier removes one of the three errors
            workspace.mets.unique_identifier = 'foobar'
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 2)

    def test_validate_file_groups_non_ocrd(self):
        """A fileGrp USE not starting with 'OCR-D-' yields a notice."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file_group('FOO')
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 1)
            self.assertIn('No files', report.errors[0])
            self.assertEqual(len(report.notices), 1)
            self.assertIn("USE does not begin with 'OCR-D-'", report.notices[0])

    def test_validate_file_groups_unspecified(self):
        """An OCR-D fileGrp with an unknown USE category yields a notice."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file_group('OCR-D-INVALID-FILEGRP')
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'))
            print(report.notices)
            self.assertEqual(len(report.errors), 1)
            self.assertEqual(len(report.notices), 1)
            self.assertEqual(report.notices[0], "Unspecified USE category 'INVALID' in fileGrp 'OCR-D-INVALID-FILEGRP'")
            self.assertIn('No files', report.errors[0])

    def test_validate_file_groups_bad_name(self):
        """A malformed USE name inside a known category yields a notice."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file_group('OCR-D-GT-X')
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 1)
            self.assertEqual(len(report.notices), 1)
            self.assertIn("Invalid USE name 'X' in fileGrp", report.notices[0])
            self.assertIn('No files', report.errors[0])

    def test_validate_files_nopageid(self):
        """A file that is not linked to any physical page is an error."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file('OCR-D-GT-PAGE', ID='file1', mimetype='image/png', url='http://foo')
            workspace.save_mets()
            report = WorkspaceValidator.validate(
                self.resolver, join(tempdir, 'mets.xml'),
                skip=['pixel_density', 'imagefilename'])
            self.assertEqual(len(report.errors), 1)
            self.assertIn("does not manifest any physical page.", report.errors[0])

    def test_validate_weird_urls(self):
        """Java-style file:/ URLs and unusual schemes are reported as errors."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file('OCR-D-GT-PAGE', ID='file1', mimetype='image/png', pageId='page1', url='file:/java-file-url')
            f = workspace.mets.add_file('OCR-D-GT-PAGE', ID='file2', mimetype='image/png', pageId='page2', url='nothttp://unusual.scheme')
            f._el.set('GROUPID', 'donotuse')  # pylint: disable=protected-access
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'), skip=['pixel_density'])
            assert not report.is_valid
            assert len(report.errors) == 2
            assert "invalid (Java-specific) file URL" in report.errors[0]

    def test_validate_pixel_no_download(self):
        """Without download, pixel-density checks on remote files are silently skipped."""
        imgpath = assets.path_to('kant_aufklaerung_1784-binarized/data/OCR-D-IMG-BIN/BIN_0020.png')
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png', pageId='page1', url=imgpath)
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'), skip=[], download=False)
            self.assertEqual(len(report.errors), 0)
            self.assertEqual(len(report.warnings), 0)
            self.assertEqual(len(report.notices), 0)

    def test_validate_pixel_density_too_low(self):
        """A low-resolution image produces x/yResolution notices when downloaded."""
        imgpath = assets.path_to('kant_aufklaerung_1784-binarized/data/OCR-D-IMG-BIN/BIN_0017.png')
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png', pageId='page1', url=imgpath)
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'), skip=[], download=True)
            self.assertEqual(len(report.notices), 2)
            self.assertIn("xResolution", report.notices[0])
            self.assertIn("yResolution", report.notices[1])
            self.assertEqual(len(report.warnings), 0)
            self.assertEqual(len(report.errors), 0)

    def test_bad_workspace(self):
        """An unresolvable METS location fails with an instantiation error."""
        report = WorkspaceValidator.validate(self.resolver, 'non existe')
        self.assertFalse(report.is_valid)
        self.assertIn('Failed to instantiate workspace:', report.errors[0])

    def test_skip_page(self):
        """With (almost) everything skipped, the kant workspace validates."""
        report = WorkspaceValidator.validate(
            self.resolver, None,
            src_dir=assets.path_to('kant_aufklaerung_1784/data'),
            download=True,
            skip=['page', 'mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'pixel_density', 'imagefilename'])
        print(report.errors)
        self.assertTrue(report.is_valid)

    def test_dimensions(self):
        """A mismatch between PAGE @imageHeight and actual image height is an error."""
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'foo')
            copytree(assets.path_to('kant_aufklaerung_1784/data'), wsdir)
            with pushd_popd(wsdir):
                # corrupt the declared height to provoke the dimension check
                os.system("""sed -i 's,imageHeight="2083",imageHeight="1234",' OCR-D-GT-PAGE/PAGE_0017_PAGE.xml""")
                report = WorkspaceValidator.validate(
                    self.resolver, join(wsdir, 'mets.xml'), src_dir=wsdir,
                    skip=['page', 'mets_unique_identifier', 'mets_file_group_names', 'mets_files',
                          'pixel_density', 'imagefilename', 'page_xsd', 'mets_xsd'],
                    download=True)
                self.assertIn("PAGE 'PAGE_0017_PAGE': @imageHeight != image's actual height (1234 != 2083)", report.errors)
                # print(report.errors)
                self.assertEqual(len(report.errors), 1)
                self.assertEqual(report.is_valid, False)
                # skipping 'dimension' silences the only error
                report2 = WorkspaceValidator.validate(
                    self.resolver, join(wsdir, 'mets.xml'), src_dir=wsdir,
                    skip=['page', 'mets_unique_identifier', 'mets_file_group_names', 'mets_files',
                          'pixel_density', 'imagefilename', 'dimension', 'page_xsd', 'mets_xsd'],
                    download=False)
                self.assertEqual(report2.is_valid, True)

    def test_src_dir(self):
        """Validating from src_dir surfaces the known textequiv consistency errors."""
        report = WorkspaceValidator.validate(
            self.resolver, None,
            src_dir=assets.path_to('kant_aufklaerung_1784/data'),
            skip=['imagefilename'],
            download=True)
        print(report.errors)
        self.assertEqual(
            len([e for e in report.errors if isinstance(e, ConsistencyError)]),
            42, '42 textequiv consistency errors')

    def test_imagefilename(self):
        """The imagefilename check alone passes on the kant workspace."""
        report = WorkspaceValidator.validate(
            self.resolver, None,
            src_dir=assets.path_to('kant_aufklaerung_1784/data'),
            skip=['page', 'mets_unique_identifier', 'mets_file_group_names', 'mets_files',
                  'pixel_density', 'page_xsd', 'mets_xsd'],
            download=False)
        self.assertEqual(len(report.errors), 0)

    def test_pcgtsid(self):
        """A pcGtsId that differs from the mets:file/@ID is a warning."""
        with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir:
            with pushd_popd(wsdir):
                # remove the @pcGtsId attribute for testing
                os.system("""sed -i 's,pcGtsId.*,pcGtsId="foo">,' OCR-D-GT-PAGE/PAGE_0017_PAGE.xml""")
                report = WorkspaceValidator.validate(self.resolver, join(wsdir, 'mets.xml'))
                self.assertIn('pc:PcGts/@pcGtsId differs from mets:file/@ID: "foo" !== "PAGE_0017_PAGE"', report.warnings)
class TestWorkspace(TestCase):
    """Tests for ocrd.workspace.Workspace (add/remove/download files, image resolution, METS I/O).

    NOTE(review): this module appears to define a class named ``TestWorkspace``
    more than once; later definitions shadow this one at import time, so these
    tests may never be collected — confirm and rename if so.
    """

    def setUp(self):
        self.resolver = Resolver()

    def test_workspace_add_file(self):
        """add_file with string content writes the file and registers it in METS."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            fpath = join(tempdir, 'ID1.tif')
            ws1.add_file('GRP', ID='ID1', mimetype='image/tiff', content='CONTENT', pageId=None, local_filename=fpath)
            f = ws1.mets.find_all_files()[0]
            self.assertEqual(f.ID, 'ID1')
            self.assertEqual(f.mimetype, 'image/tiff')
            self.assertEqual(f.url, fpath)
            self.assertEqual(f.local_filename, fpath)
            self.assertTrue(exists(fpath))

    def test_workspace_add_file_basename_no_content(self):
        """add_file without content registers the file but leaves url unset."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.add_file('GRP', ID='ID1', mimetype='image/tiff', pageId=None)
            f = next(ws1.mets.find_files())
            self.assertEqual(f.url, None)

    def test_workspace_add_file_binary_content(self):
        """Binary content is written, creating intermediate directories as needed."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            fpath = join(tempdir, 'subdir', 'ID1.tif')
            ws1.add_file('GRP', ID='ID1', content=b'CONTENT', local_filename=fpath, url='http://foo/bar', pageId=None)
            self.assertTrue(exists(fpath))

    def test_workspacec_add_file_content_wo_local_filename(self):
        """Content without a local_filename is rejected."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            with self.assertRaisesRegex(Exception, "'content' was set but no 'local_filename'"):
                ws1.add_file('GRP', ID='ID1', content=b'CONTENT', pageId='foo1234')

    def test_workspacec_add_file_content_wo_pageid(self):
        """Omitting the pageId kwarg entirely is rejected."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            with self.assertRaisesRegex(
                    ValueError,
                    "workspace.add_file must be passed a 'pageId' kwarg, even if it is None."):
                ws1.add_file('GRP', ID='ID1', content=b'CONTENT', local_filename='foo')

    def test_workspace_str(self):
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(str(ws1), 'Workspace[directory=%s, baseurl=None, file_groups=[], files=[]]' % tempdir)

    def test_workspace_backup(self):
        """save/reload with automatic_backup enabled keeps the workspace intact."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.automatic_backup = True
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(str(ws1), 'Workspace[directory=%s, baseurl=None, file_groups=[], files=[]]' % tempdir)

    def test_download_url0(self):
        """Local files are downloaded into the TEMP fileGrp by basename."""
        with TemporaryDirectory() as directory:
            ws1 = self.resolver.workspace_from_nothing(directory)
            fn = ws1.download_url(abspath(__file__))
            self.assertEqual(fn, join('TEMP', basename(__file__)))

    def test_download_url_without_baseurl(self):
        with TemporaryDirectory() as tempdir:
            dst_mets = join(tempdir, 'mets.xml')
            copyfile(SRC_METS, dst_mets)
            ws1 = self.resolver.workspace_from_url(dst_mets)
            with self.assertRaisesRegex(Exception, "Already tried prepending baseurl '%s'" % tempdir):
                ws1.download_url(SAMPLE_FILE_URL)

    def test_download_url_with_baseurl(self):
        with TemporaryDirectory() as tempdir:
            dst_mets = join(tempdir, 'mets.xml')
            copyfile(SRC_METS, dst_mets)
            ws1 = self.resolver.workspace_from_url(dst_mets, src_baseurl=dirname(SRC_METS))
            f = Path(ws1.download_url(SAMPLE_FILE_URL))
            self.assertEqual(f, Path('TEMP', '%s.tif' % SAMPLE_FILE_ID))
            self.assertTrue(Path(ws1.directory, f).exists())

    def test_from_url_dst_dir_download(self):
        """
        https://github.com/OCR-D/core/issues/319
        """
        with TemporaryDirectory() as tempdir:
            ws_dir = join(tempdir, 'non-existing-for-good-measure')
            # Create a relative path to trigger #319
            src_path = str(Path(assets.path_to('kant_aufklaerung_1784/data/mets.xml')).relative_to(Path.cwd()))
            self.resolver.workspace_from_url(src_path, dst_dir=ws_dir, download=True)
            self.assertTrue(Path(ws_dir, 'mets.xml').exists())  # sanity check, mets.xml must exist
            self.assertTrue(Path(ws_dir, 'OCR-D-GT-PAGE/PAGE_0017_PAGE.xml').exists())

    def test_superfluous_copies_in_ws_dir(self):
        """
        https://github.com/OCR-D/core/issues/227
        """
        def find_recursive(root):
            # flat list of all file basenames below root
            ret = []
            for _, _, f in walk(root):
                for file in f:
                    ret.append(file)
            return ret
        with TemporaryDirectory() as wsdir:
            with open(assets.path_to('SBB0000F29300010000/data/mets_one_file.xml'), 'r') as f_in:
                with open(join(wsdir, 'mets.xml'), 'w') as f_out:
                    f_out.write(f_in.read())
            self.assertEqual(len(find_recursive(wsdir)), 1)
            ws1 = Workspace(self.resolver, wsdir)
            for file in ws1.mets.find_all_files():
                ws1.download_file(file)
            # exactly one copy of the image must have been added
            self.assertEqual(len(find_recursive(wsdir)), 2)
            self.assertTrue(exists(join(wsdir, 'OCR-D-IMG/FILE_0005_IMAGE.tif')))

    def test_remove_file_force(self):
        with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
            workspace = Workspace(self.resolver, directory=tempdir)
            with self.assertRaisesRegex(FileNotFoundError, "not found"):
                # should fail
                workspace.remove_file('non-existing-id')
            # should succeed
            workspace.remove_file('non-existing-id', force=True)
            # should also succeed
            workspace.overwrite_mode = True
            workspace.remove_file('non-existing-id', force=False)

    def test_remove_file_remote(self):
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('IMG', ID='page1_img', mimetype='image/tiff', url='http://remote', pageId=None)
            with self.assertRaisesRegex(Exception, "not locally available"):
                # should fail
                ws.remove_file('page1_img')
            # should succeed
            ws.remove_file('page1_img', force=True)
            # should also succeed
            ws.overwrite_mode = True
            ws.remove_file('page1_img', force=False)

    def test_remove_file_group_force(self):
        with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
            workspace = Workspace(self.resolver, directory=tempdir)
            with self.assertRaisesRegex(Exception, "No such fileGrp"):
                # should fail
                workspace.remove_file_group('I DO NOT EXIST')
            # should succeed
            workspace.remove_file_group('I DO NOT EXIST', force=True)
            # should also succeed
            workspace.overwrite_mode = True
            workspace.remove_file_group('I DO NOT EXIST', force=False)

    def test_remove_file_group_rmdir(self):
        """remove_file_group(recursive=True) also deletes the directory on disk."""
        with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
            workspace = Workspace(self.resolver, directory=tempdir)
            self.assertTrue(exists(join(tempdir, 'OCR-D-IMG')))
            workspace.remove_file_group('OCR-D-IMG', recursive=True)
            self.assertFalse(exists(join(tempdir, 'OCR-D-IMG')))

    def test_remove_file_page_recursive(self):
        with copy_of_directory(assets.path_to('kant_aufklaerung_1784-complex/data')) as tempdir:
            with pushd_popd(tempdir):
                ws = Workspace(self.resolver, directory=tempdir)
                self.assertEqual(len(ws.mets.find_all_files()), 119)
                ws.remove_file('OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001',
                               page_recursive=True, page_same_group=False, keep_file=True)
                self.assertEqual(len(ws.mets.find_all_files()), 83)
                ws.remove_file('PAGE_0017_ALTO', page_recursive=True)

    def test_remove_file_page_recursive_keep_file(self):
        with copy_of_directory(assets.path_to('kant_aufklaerung_1784-complex/data')) as tempdir:
            with pushd_popd(tempdir):
                ws = Workspace(self.resolver, directory=tempdir)
                before = count_files()
                ws.remove_file('OCR-D-IMG-BINPAGE-sauvola_0001',
                               page_recursive=True, page_same_group=False, force=True)
                after = count_files()
                self.assertEqual(after, before - 2, '2 files deleted')

    def test_remove_file_page_recursive_same_group(self):
        with copy_of_directory(assets.path_to('kant_aufklaerung_1784-complex/data')) as tempdir:
            with pushd_popd(tempdir):
                ws = Workspace(self.resolver, directory=tempdir)
                before = count_files()
                ws.remove_file('OCR-D-IMG-BINPAGE-sauvola_0001',
                               page_recursive=True, page_same_group=True, force=False)
                after = count_files()
                self.assertEqual(after, before - 1, '2 file deleted')

    def test_download_to_directory_from_workspace_download_file(self):
        """
        https://github.com/OCR-D/core/issues/342
        """
        # tempdir = mkdtemp()
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            f1 = ws1.add_file('IMG', ID='page1_img', mimetype='image/tiff', local_filename='test.tif', content='', pageId=None)
            f2 = ws1.add_file('GT', ID='page1_gt', mimetype='text/xml', local_filename='test.xml', content='', pageId=None)
            self.assertEqual(f1.url, 'test.tif')
            self.assertEqual(f2.url, 'test.xml')
            # these should be no-ops
            ws1.download_file(f1)
            ws1.download_file(f2)
            self.assertEqual(f1.url, 'test.tif')
            self.assertEqual(f2.url, 'test.xml')

    def test_save_image_file(self):
        img = Image.new('RGB', (1000, 1000))
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            # unknown mimetype has no file extension mapping
            with self.assertRaisesRegex(KeyError, ''):
                ws.save_image_file(img, 'page1_img', 'IMG', 'page1', 'ceci/nest/pas/une/mimetype')
            ws.save_image_file(img, 'page1_img', 'IMG', 'page1', 'image/jpeg')
            self.assertTrue(exists(join(tempdir, 'IMG', 'page1_img.jpg')))
            # should succeed
            ws.save_image_file(img, 'page1_img', 'IMG', 'page1', 'image/jpeg', force=True)
            # should also succeed
            ws.overwrite_mode = True
            ws.save_image_file(img, 'page1_img', 'IMG', 'page1', 'image/jpeg')

    def test_resolve_image_exif(self):
        with pushd_popd(assets.path_to('kant_aufklaerung_1784/data/')):
            ws = self.resolver.workspace_from_url('mets.xml')
            exif = ws.resolve_image_exif('OCR-D-IMG/INPUT_0017.tif')
            self.assertEqual(exif.compression, 'jpeg')
            self.assertEqual(exif.width, 1457)

    def test_resolve_image_as_pil(self):
        with pushd_popd(assets.path_to('kant_aufklaerung_1784/data/')):
            ws = self.resolver.workspace_from_url('mets.xml')
            img = ws.resolve_image_as_pil('OCR-D-IMG/INPUT_0017.tif')
            self.assertEqual(img.width, 1457)
            img = ws.resolve_image_as_pil('OCR-D-IMG/INPUT_0017.tif', coords=([100, 100], [50, 50]))
            self.assertEqual(img.width, 50)

    def test_image_from_page_basic(self):
        with pushd_popd(assets.path_to('gutachten/data')):
            ws = self.resolver.workspace_from_url('mets.xml')
            with open('TEMP1/PAGE_TEMP1.xml', 'r') as f:
                pcgts = parseString(f.read().encode('utf8'), silence=True)
            # FIX: use assertEqual — assertEquals is a deprecated alias
            # (removed in Python 3.12)
            img, info, exif = ws.image_from_page(pcgts.get_Page(), page_id='PHYS_0017',
                                                 feature_selector='clipped', feature_filter='cropped')
            self.assertEqual(info['features'], 'binarized,clipped')
            img, info, exif = ws.image_from_page(pcgts.get_Page(), page_id='PHYS_0017')
            self.assertEqual(info['features'], 'binarized,clipped')

    def test_downsample_16bit_image(self):
        with pushd_popd(tempdir=True) as tempdir:
            with gzip_open(join(dirname(__file__), 'data/OCR-D-IMG_APBB_Mitteilungen_62.0002.tif.gz'), 'rb') as gzip_in:
                with open('16bit.tif', 'wb') as tif_out:
                    tif_out.write(gzip_in.read())
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('IMG', ID='foo', url='16bit.tif', mimetype='image/tiff', pageId=None)
            pil_before = Image.open('16bit.tif')
            assert pil_before.mode == 'I;16'
            # 16-bit images are downsampled to 8-bit grayscale on resolution
            pil_after = ws._resolve_image_as_pil('16bit.tif')
            assert pil_after.mode == 'L'

    def test_mets_permissions(self):
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(tempdir)
            ws.save_mets()
            mets_path = join(ws.directory, 'mets.xml')
            # read the current umask without changing it permanently
            mask = umask(0)
            umask(mask)
            assert (stat(mets_path).st_mode) == 0o100664 & ~mask
            # save_mets must preserve pre-existing permissions
            chmod(mets_path, 0o777)
            ws.save_mets()
            assert filemode(stat(mets_path).st_mode) == '-rwxrwxrwx'
class TestProcessor(TestCase):
    """Tests for ocrd.processor.Processor and run_processor/run_cli."""

    def setUp(self):
        disableLogging()
        initLogging()
        self.resolver = Resolver()
        self.workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))

    def test_incomplete_processor(self):
        """A processor without a process() implementation raises."""
        proc = IncompleteProcessor(None)
        with self.assertRaisesRegex(Exception, 'Must be implemented'):
            proc.process()

    def test_no_resolver(self):
        with self.assertRaisesRegex(Exception, 'pass a resolver to create a workspace'):
            run_processor(DummyProcessor)

    def test_no_mets_url(self):
        with self.assertRaisesRegex(Exception, 'pass mets_url to create a workspace'):
            run_processor(DummyProcessor, resolver=self.resolver)

    def test_no_input_file_grp(self):
        processor = run_processor(
            DummyProcessor,
            resolver=self.resolver,
            mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
        with self.assertRaisesRegex(Exception, 'Processor is missing input fileGrp'):
            _ = processor.input_files

    def test_with_mets_url_input_files(self):
        processor = run_processor(
            DummyProcessor,
            input_file_grp='OCR-D-SEG-PAGE',
            resolver=self.resolver,
            mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
        self.assertEqual(len(processor.input_files), 2)
        self.assertTrue(all(f.mimetype == MIMETYPE_PAGE for f in processor.input_files))

    def test_parameter(self):
        """Parameters can be loaded from a JSON file."""
        with TemporaryDirectory() as tempdir:
            jsonpath = join(tempdir, 'params.json')
            with open(jsonpath, 'w') as f:
                f.write('{"baz": "quux"}')
            with open(jsonpath, 'r') as f:
                processor = run_processor(
                    DummyProcessor,
                    parameter=json.load(f),
                    input_file_grp="OCR-D-IMG",
                    resolver=self.resolver,
                    mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
            self.assertEqual(len(processor.input_files), 3)

    def test_verify(self):
        proc = DummyProcessor(self.workspace)
        self.assertEqual(proc.verify(), True)

    def test_json(self):
        DummyProcessor(self.workspace, dump_json=True)

    def test_params_missing_required(self):
        with self.assertRaisesRegex(Exception, 'is a required property'):
            DummyProcessorWithRequiredParameters(workspace=self.workspace)

    def test_params(self):
        proc = Processor(workspace=self.workspace)
        self.assertEqual(proc.parameter, {})

    def test_run_agent(self):
        """Running a processor records a new METS agent."""
        no_agents_before = len(self.workspace.mets.agents)
        run_processor(DummyProcessor, ocrd_tool=DUMMY_TOOL, workspace=self.workspace)
        self.assertEqual(len(self.workspace.mets.agents), no_agents_before + 1, 'one more agent')
        # print(self.workspace.mets.agents[no_agents_before])

    def test_run_cli(self):
        with TemporaryDirectory() as tempdir:
            run_processor(DummyProcessor, ocrd_tool=DUMMY_TOOL, workspace=self.workspace)
            run_cli(
                'echo',
                mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
                resolver=Resolver(),
                workspace=None,
                page_id='page1',
                log_level='DEBUG',
                input_file_grp='INPUT',
                output_file_grp='OUTPUT',
                parameter='/path/to/param.json',
                working_dir=tempdir)
            run_cli(
                'echo',
                mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
                resolver=Resolver())

    def test_zip_input_files(self):
        """zip_input_files pairs files across fileGrps by page, with mimetype filtering."""
        class ZipTestProcessor(Processor):
            pass
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
            ws.add_file('GRP2', mimetype='application/alto+xml', ID='foobar2', pageId='phys_0001')
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002')
            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', pageId='phys_0002')
            for page_id in [None, 'phys_0001,phys_0002']:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
                    assert ('foobar1', 'foobar2') in tuples
                    assert ('foobar3', 'foobar4') in tuples
                    tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)]
                    assert ('foobar1', None) in tuples
                    tuples = [(one.ID, two.ID) for one, two
                              in proc.zip_input_files(mimetype=r'//application/(vnd.prima.page|alto)\+xml')]
                    assert ('foobar1', 'foobar2') in tuples
                    assert ('foobar3', 'foobar4') in tuples

    def test_zip_input_files_multi_mixed(self):
        """Ambiguous matches are resolved per the on_error policy (first/skip/abort)."""
        class ZipTestProcessor(Processor):
            pass
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
            ws.add_file('GRP1', mimetype='image/png', ID='foobar1img1', pageId='phys_0001')
            ws.add_file('GRP1', mimetype='image/png', ID='foobar1img2', pageId='phys_0001')
            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001')
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002')
            ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4', pageId='phys_0002')
            for page_id in [None, 'phys_0001,phys_0002']:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    print("unfiltered")
                    tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
                    assert ('foobar1', 'foobar2') in tuples
                    assert ('foobar3', 'foobar4') in tuples
                    print("PAGE-filtered")
                    tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)]
                    assert ('foobar3', None) in tuples
            # introduce an ambiguity: two non-PAGE files for the same page in GRP2
            ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4dup', pageId='phys_0002')
            for page_id in [None, 'phys_0001,phys_0002']:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(on_error='first')]
                    assert ('foobar1', 'foobar2') in tuples
                    assert ('foobar3', 'foobar4') in tuples
                    tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')]
                    assert ('foobar3', None) in tuples
                    with self.assertRaisesRegex(
                            Exception,
                            "No PAGE-XML for page .* in fileGrp .* but multiple matches."):
                        tuples = proc.zip_input_files(on_error='abort')
            # multiple PAGE-XML files for the same page is always fatal
            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2dup', pageId='phys_0001')
            for page_id in [None, 'phys_0001,phys_0002']:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"):
                        tuples = proc.zip_input_files()

    def test_zip_input_files_require_first(self):
        """With require_first=False, a page missing from the first fileGrp yields (None, file)."""
        class ZipTestProcessor(Processor):
            pass
        self.capture_out_err()
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId=None)
            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001')
            for page_id in [None, 'phys_0001,phys_0002']:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    assert [(one, two.ID) for one, two
                            in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')]
            r = self.capture_out_err()
            assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err
class TestWorkspace(TestCase):
    """Basic Workspace tests (add_file, str, backup, download_url, issue #227).

    NOTE(review): this class name also appears earlier in the module; this
    definition shadows the earlier one at import time — confirm whether the
    duplication is intentional.
    """

    def setUp(self):
        self.resolver = Resolver()

    def test_workspace_add_file(self):
        """add_file with string content writes the file and registers it in METS."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            fpath = join(tempdir, 'ID1.tif')
            ws1.add_file('GRP', ID='ID1', mimetype='image/tiff', content='CONTENT', local_filename=fpath)
            f = ws1.mets.find_files()[0]
            self.assertEqual(f.ID, 'ID1')
            self.assertEqual(f.mimetype, 'image/tiff')
            self.assertEqual(f.url, fpath)
            self.assertEqual(f.local_filename, fpath)
            self.assertTrue(exists(fpath))

    def test_workspace_add_file_basename_no_content(self):
        """Without content, the file is registered but has no url."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.add_file('GRP', ID='ID1', mimetype='image/tiff')
            f = ws1.mets.find_files()[0]
            self.assertEqual(f.url, None)

    def test_workspace_add_file_binary_content(self):
        """Binary content is written, creating intermediate directories."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            fpath = join(tempdir, 'subdir', 'ID1.tif')
            ws1.add_file('GRP', ID='ID1', content=b'CONTENT', local_filename=fpath, url='http://foo/bar')
            self.assertTrue(exists(fpath))

    def test_workspacec_add_file_content_wo_local_filename(self):
        """Content without local_filename is rejected."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            with self.assertRaisesRegex(Exception, "'content' was set but no 'local_filename'"):
                ws1.add_file('GRP', ID='ID1', content=b'CONTENT')

    def test_workspace_str(self):
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(str(ws1), 'Workspace[directory=%s, baseurl=None, file_groups=[], files=[]]' % tempdir)

    def test_workspace_backup(self):
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.automatic_backup = True
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(str(ws1), 'Workspace[directory=%s, baseurl=None, file_groups=[], files=[]]' % tempdir)

    def test_download_url0(self):
        """Local files are downloaded into the TEMP fileGrp by basename."""
        with TemporaryDirectory() as directory:
            ws1 = self.resolver.workspace_from_nothing(directory)
            fn = ws1.download_url(abspath(__file__))
            self.assertEqual(fn, join('TEMP', basename(__file__)))

    def test_download_url_without_baseurl(self):
        with TemporaryDirectory() as tempdir:
            dst_mets = join(tempdir, 'mets.xml')
            copyfile(SRC_METS, dst_mets)
            ws1 = self.resolver.workspace_from_url(dst_mets)
            with self.assertRaisesRegex(Exception, "Already tried prepending baseurl '%s'" % tempdir):
                ws1.download_url(SAMPLE_FILE_URL)

    def test_download_url_with_baseurl(self):
        with TemporaryDirectory() as tempdir:
            dst_mets = join(tempdir, 'mets.xml')
            copyfile(SRC_METS, dst_mets)
            ws1 = self.resolver.workspace_from_url(dst_mets, src_baseurl=dirname(SRC_METS))
            f = Path(ws1.download_url(SAMPLE_FILE_URL))
            self.assertEqual(f, Path('TEMP', '%s.tif' % SAMPLE_FILE_ID))
            self.assertTrue(Path(ws1.directory, f).exists())

    def test_227_1(self):
        """Downloading must not create superfluous copies (issue #227)."""
        def find_recursive(root):
            # flat list of all file basenames below root
            ret = []
            for _, _, f in walk(root):
                for file in f:
                    ret.append(file)
            return ret
        with TemporaryDirectory() as wsdir:
            with open(assets.path_to('SBB0000F29300010000/data/mets_one_file.xml'), 'r') as f_in:
                with open(join(wsdir, 'mets.xml'), 'w') as f_out:
                    f_out.write(f_in.read())
            self.assertEqual(len(find_recursive(wsdir)), 1)
            ws1 = Workspace(self.resolver, wsdir)
            for file in ws1.mets.find_files():
                ws1.download_file(file)
            self.assertEqual(len(find_recursive(wsdir)), 2)
            self.assertTrue(exists(join(wsdir, 'OCR-D-IMG/FILE_0005_IMAGE.tif')))
class TestWorkspace(TestCase):
    """Exercise Workspace: file addition, METS serialization, and downloads.

    Fixes applied: removed the leftover debug shell-out
    ``from os import system; system('find %s' % ws_dir)`` in
    test_from_url_dst_dir_download (a string-built shell command with no
    assertion value), and deleted the commented-out ``test_remove`` dead code.
    """

    def setUp(self):
        self.resolver = Resolver()

    def test_workspace_add_file(self):
        """Adding a file with content persists it on disk and registers it in METS."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            fpath = join(tempdir, 'ID1.tif')
            ws1.add_file('GRP', ID='ID1', mimetype='image/tiff',
                         content='CONTENT', local_filename=fpath)
            f = ws1.mets.find_files()[0]
            self.assertEqual(f.ID, 'ID1')
            self.assertEqual(f.mimetype, 'image/tiff')
            self.assertEqual(f.url, fpath)
            self.assertEqual(f.local_filename, fpath)
            self.assertTrue(exists(fpath))

    def test_workspace_add_file_basename_no_content(self):
        """A file added without content has no URL."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.add_file('GRP', ID='ID1', mimetype='image/tiff')
            f = ws1.mets.find_files()[0]
            self.assertEqual(f.url, None)

    def test_workspace_add_file_binary_content(self):
        """Binary content is written out, including intermediate directories."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            fpath = join(tempdir, 'subdir', 'ID1.tif')
            ws1.add_file('GRP', ID='ID1', content=b'CONTENT',
                         local_filename=fpath, url='http://foo/bar')
            self.assertTrue(exists(fpath))

    def test_workspacec_add_file_content_wo_local_filename(self):
        """Passing content without a local_filename is rejected."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            with self.assertRaisesRegex(
                    Exception, "'content' was set but no 'local_filename'"):
                ws1.add_file('GRP', ID='ID1', content=b'CONTENT')

    def test_workspace_str(self):
        """String representation survives a save/reload round-trip."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(
                str(ws1),
                'Workspace[directory=%s, baseurl=None, file_groups=[], files=[]]' % tempdir)

    def test_workspace_backup(self):
        """Enabling automatic backups does not change the string representation."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.automatic_backup = True
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(
                str(ws1),
                'Workspace[directory=%s, baseurl=None, file_groups=[], files=[]]' % tempdir)

    def test_download_url0(self):
        """Downloading a local absolute path places the file under TEMP."""
        with TemporaryDirectory() as directory:
            ws1 = self.resolver.workspace_from_nothing(directory)
            fn = ws1.download_url(abspath(__file__))
            self.assertEqual(fn, join('TEMP', basename(__file__)))

    def test_download_url_without_baseurl(self):
        """Without a source baseurl, a relative sample URL cannot be resolved."""
        with TemporaryDirectory() as tempdir:
            dst_mets = join(tempdir, 'mets.xml')
            copyfile(SRC_METS, dst_mets)
            ws1 = self.resolver.workspace_from_url(dst_mets)
            with self.assertRaisesRegex(
                    Exception, "Already tried prepending baseurl '%s'" % tempdir):
                ws1.download_url(SAMPLE_FILE_URL)

    def test_download_url_with_baseurl(self):
        """With a source baseurl the sample file resolves and lands under TEMP."""
        with TemporaryDirectory() as tempdir:
            dst_mets = join(tempdir, 'mets.xml')
            copyfile(SRC_METS, dst_mets)
            ws1 = self.resolver.workspace_from_url(
                dst_mets, src_baseurl=dirname(SRC_METS))
            f = Path(ws1.download_url(SAMPLE_FILE_URL))
            self.assertEqual(f, Path('TEMP', '%s.tif' % SAMPLE_FILE_ID))
            self.assertTrue(Path(ws1.directory, f).exists())

    def test_from_url_dst_dir_download(self):
        """
        https://github.com/OCR-D/core/issues/319
        """
        with TemporaryDirectory() as tempdir:
            ws_dir = join(tempdir, 'non-existing-for-good-measure')
            # Create a relative path to trigger #319
            src_path = str(
                Path(assets.path_to(
                    'kant_aufklaerung_1784/data/mets.xml')).relative_to(
                        Path.cwd()))
            self.resolver.workspace_from_url(src_path, dst_dir=ws_dir, download=True)
            self.assertTrue(
                Path(ws_dir, 'mets.xml').exists())  # sanity check, mets.xml must exist
            self.assertTrue(
                Path(ws_dir, 'OCR-D-GT-PAGE/PAGE_0017_PAGE.xml').exists())

    def test_superfluous_copies_in_ws_dir(self):
        """
        https://github.com/OCR-D/core/issues/227
        """
        def find_recursive(root):
            # every file name anywhere below root
            ret = []
            for _, _, f in walk(root):
                for file in f:
                    ret.append(file)
            return ret

        with TemporaryDirectory() as wsdir:
            with open(
                    assets.path_to(
                        'SBB0000F29300010000/data/mets_one_file.xml'), 'r') as f_in:
                with open(join(wsdir, 'mets.xml'), 'w') as f_out:
                    f_out.write(f_in.read())
            self.assertEqual(len(find_recursive(wsdir)), 1)
            ws1 = Workspace(self.resolver, wsdir)
            for file in ws1.mets.find_files():
                ws1.download_file(file)
            # exactly one new file: the downloaded image, no stray copies
            self.assertEqual(len(find_recursive(wsdir)), 2)
            self.assertTrue(
                exists(join(wsdir, 'OCR-D-IMG/FILE_0005_IMAGE.tif')))

    def test_download_to_directory_from_workspace_download_file(self):
        """
        https://github.com/OCR-D/core/issues/342
        """
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            f1 = ws1.add_file('IMG', ID='page1_img', mimetype='image/tiff',
                              local_filename='test.tif', content='')
            f2 = ws1.add_file('GT', ID='page1_gt', mimetype='text/xml',
                              local_filename='test.xml', content='')
            self.assertEqual(f1.url, 'test.tif')
            self.assertEqual(f2.url, 'test.xml')
            # these should be no-ops
            ws1.download_file(f1)
            ws1.download_file(f2)
            self.assertEqual(f1.url, 'test.tif')
            self.assertEqual(f2.url, 'test.xml')
class TestWorkspaceValidator(TestCase):
    """Exercise WorkspaceValidator over METS fixtures and synthetic workspaces.

    Fixes applied: test_dimensions now edits the PAGE file with pathlib
    instead of ``os.system("sed -i ...")`` (GNU-sed-only, fails on BSD sed /
    macOS and Windows); leftover ``print(report.errors)`` debug output removed.
    """

    def setUp(self):
        self.resolver = Resolver()

    def test_simple(self):
        """A known-good fixture validates cleanly."""
        report = WorkspaceValidator.validate(
            self.resolver,
            assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'),
            download=True)
        self.assertTrue(report.is_valid)

    def test_validate_twice(self):
        """Running the same validator instance twice still yields a valid report."""
        validator = WorkspaceValidator(
            self.resolver,
            assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'),
            download=True)
        report = validator._validate()  # pylint: disable=protected-access
        report = validator._validate()  # pylint: disable=protected-access
        self.assertTrue(report.is_valid)

    def test_validate_empty(self):
        """An empty workspace reports a missing identifier and missing files."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 2)
            self.assertIn('no unique identifier', report.errors[0])
            self.assertIn('No files', report.errors[1])
            # setting the identifier resolves the first error
            workspace.mets.unique_identifier = 'foobar'
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 1)

    def test_validate_file_groups_non_ocrd(self):
        """A fileGrp USE without the OCR-D- prefix yields a notice."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file_group('FOO')
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 1)
            self.assertIn('No files', report.errors[0])
            self.assertEqual(len(report.notices), 1)
            self.assertIn("USE does not begin with 'OCR-D-'", report.notices[0])

    def test_validate_file_groups_unspecified(self):
        """An unspecified USE category yields a warning."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file_group('OCR-D-INVALID-FILEGRP')
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 1)
            self.assertEqual(len(report.warnings), 1)
            self.assertEqual(
                report.warnings[0],
                "Unspecified USE category 'INVALID' in fileGrp 'OCR-D-INVALID-FILEGRP'"
            )
            self.assertIn('No files', report.errors[0])

    def test_validate_file_groups_bad_name(self):
        """An invalid USE name suffix yields a warning."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file_group('OCR-D-GT-X')
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 1)
            self.assertEqual(len(report.warnings), 1)
            self.assertIn("Invalid USE name 'X' in fileGrp", report.warnings[0])
            self.assertIn('No files', report.errors[0])

    def test_validate_files_nopageid(self):
        """A file without a pageId is reported as not manifesting a page."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file('OCR-D-GT-PAGE', ID='file1',
                                    mimetype='image/png', url='http://foo')
            workspace.save_mets()
            report = WorkspaceValidator.validate(
                self.resolver, join(tempdir, 'mets.xml'),
                skip=['pixel_density', 'imagefilename'])
            self.assertEqual(len(report.errors), 1)
            self.assertIn("does not manifest any physical page.", report.errors[0])

    def test_validate_weird_urls(self):
        """Java-style file URLs are flagged as errors."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file('OCR-D-GT-PAGE', ID='file1', mimetype='image/png',
                                    pageId='page1', url='file:/java-file-url')
            f = workspace.mets.add_file('OCR-D-GT-PAGE', ID='file2', mimetype='image/png',
                                        pageId='page2', url='nothttp://unusual.scheme')
            f._el.set('GROUPID', 'donotuse')  # pylint: disable=protected-access
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'),
                                                 skip=['pixel_density'])
            self.assertEqual(len(report.errors), 1)
            self.assertIn("Invalid (java) URL", report.errors[0])

    def test_validate_pixel_no_download(self):
        """Without download, pixel density of a local image is not checked."""
        imgpath = assets.path_to(
            'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-BIN/BIN_0020.png')
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png',
                                    pageId='page1', url=imgpath)
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'),
                                                 skip=[], download=False)
            self.assertEqual(len(report.errors), 0)
            self.assertEqual(len(report.warnings), 0)
            self.assertEqual(len(report.notices), 0)

    def test_validate_pixel_density_too_low(self):
        """Low x/y resolution yields two notices."""
        imgpath = assets.path_to(
            'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-BIN/BIN_0017.png')
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png',
                                    pageId='page1', url=imgpath)
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'),
                                                 skip=[], download=True)
            self.assertEqual(len(report.notices), 2)
            self.assertIn("xResolution", report.notices[0])
            self.assertIn("yResolution", report.notices[1])
            self.assertEqual(len(report.warnings), 0)
            self.assertEqual(len(report.errors), 0)

    def test_bad_workspace(self):
        """A non-existent workspace path fails to instantiate."""
        report = WorkspaceValidator.validate(self.resolver, 'non existe')
        self.assertFalse(report.is_valid)
        self.assertIn('Failed to instantiate workspace:', report.errors[0])

    def test_skip_page(self):
        """Skipping every check produces a valid report."""
        report = WorkspaceValidator.validate(
            self.resolver,
            None,
            src_dir=assets.path_to('kant_aufklaerung_1784/data'),
            download=True,
            skip=[
                'page', 'mets_unique_identifier', 'mets_file_group_names',
                'mets_files', 'pixel_density', 'imagefilename',
            ])
        self.assertTrue(report.is_valid)

    def test_dimensions(self):
        """A mismatch between @imageHeight and the actual image height is an error."""
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'foo')
            copytree(assets.path_to('kant_aufklaerung_1784/data'), wsdir)
            with pushd_popd(wsdir):
                # falsify the recorded height; pure-Python replacement is
                # portable (GNU 'sed -i' is not available on BSD/macOS/Windows)
                page_path = Path('OCR-D-GT-PAGE/PAGE_0017_PAGE.xml')
                page_path.write_text(page_path.read_text().replace(
                    'imageHeight="2083"', 'imageHeight="1234"'))
                report = WorkspaceValidator.validate(
                    self.resolver,
                    join(wsdir, 'mets.xml'),
                    src_dir=wsdir,
                    skip=[
                        'page', 'mets_unique_identifier', 'mets_file_group_names',
                        'mets_files', 'pixel_density', 'imagefilename'
                    ],
                    download=False)
                self.assertIn(
                    "PAGE 'PAGE_0017_PAGE': @imageHeight != image's actual height (1234 != 2083)",
                    report.errors)
                self.assertEqual(len(report.errors), 1)
                self.assertEqual(report.is_valid, False)
                # skipping 'dimension' suppresses the error
                report2 = WorkspaceValidator.validate(
                    self.resolver,
                    join(wsdir, 'mets.xml'),
                    src_dir=wsdir,
                    skip=[
                        'page', 'mets_unique_identifier', 'mets_file_group_names',
                        'mets_files', 'pixel_density', 'imagefilename', 'dimension'
                    ],
                    download=False)
                self.assertEqual(report2.is_valid, True)

    def test_src_dir(self):
        """Full validation of the fixture yields exactly 42 textequiv errors."""
        report = WorkspaceValidator.validate(
            self.resolver,
            None,
            src_dir=assets.path_to('kant_aufklaerung_1784/data'),
            skip=['imagefilename'],
            download=True,
        )
        self.assertEqual(
            len([e for e in report.errors if isinstance(e, ConsistencyError)]),
            42, '42 textequiv consistency errors')

    def test_imagefilename(self):
        """The imagefilename check alone passes on the fixture."""
        report = WorkspaceValidator.validate(
            self.resolver,
            None,
            src_dir=assets.path_to('kant_aufklaerung_1784/data'),
            skip=[
                'page', 'mets_unique_identifier', 'mets_file_group_names',
                'mets_files', 'pixel_density'
            ],
            download=False,
        )
        self.assertEqual(len(report.errors), 0)
class TestWorkspaceValidator(TestCase):
    """Exercise WorkspaceValidator over METS fixtures and synthetic workspaces."""

    def setUp(self):
        self.resolver = Resolver()

    def test_simple(self):
        """A known-good fixture validates cleanly."""
        report = WorkspaceValidator.validate(
            self.resolver,
            assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'),
            download=True)
        self.assertTrue(report.is_valid)

    def test_validate_twice(self):
        """Running the same validator instance twice still yields a valid report."""
        validator = WorkspaceValidator(
            self.resolver,
            assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'),
            download=True)
        report = validator._validate()  # pylint: disable=protected-access
        report = validator._validate()  # pylint: disable=protected-access
        self.assertTrue(report.is_valid)

    def test_validate_empty(self):
        """An empty workspace reports a missing identifier and missing files."""
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 2)
            self.assertIn('no unique identifier', report.errors[0])
            self.assertIn('No files', report.errors[1])
            # setting the identifier resolves the first error
            ws.mets.unique_identifier = 'foobar'
            ws.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 1)

    def test_validate_file_groups_non_ocrd(self):
        """A fileGrp USE without the OCR-D- prefix yields a notice."""
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.mets.unique_identifier = 'foobar'
            ws.mets.add_file_group('FOO')
            ws.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 1)
            self.assertIn('No files', report.errors[0])
            self.assertEqual(len(report.notices), 1)
            self.assertIn("USE does not begin with 'OCR-D-'", report.notices[0])

    def test_validate_file_groups_unspecified(self):
        """An unspecified USE category yields an error."""
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.mets.unique_identifier = 'foobar'
            ws.mets.add_file_group('OCR-D-INVALID-FILEGRP')
            ws.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 2)
            self.assertEqual(
                report.errors[0],
                "Unspecified USE category 'INVALID' in fileGrp 'OCR-D-INVALID-FILEGRP'"
            )
            self.assertIn('No files', report.errors[1])

    def test_validate_file_groups_bad_name(self):
        """An invalid USE name suffix yields an error."""
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.mets.unique_identifier = 'foobar'
            ws.mets.add_file_group('OCR-D-GT-X')
            ws.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 2)
            self.assertIn("Invalid USE name 'X' in fileGrp", report.errors[0])
            self.assertIn('No files', report.errors[1])

    def test_validate_files_nopageid(self):
        """A file without a pageId is reported as not manifesting a page."""
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.mets.unique_identifier = 'foobar'
            ws.mets.add_file('OCR-D-GT-PAGE', ID='file1', mimetype='image/png')
            ws.save_mets()
            report = WorkspaceValidator.validate(
                self.resolver, join(tempdir, 'mets.xml'), skip=['pixel_density'])
            self.assertEqual(len(report.errors), 1)
            self.assertIn("does not manifest any physical page.", report.errors[0])

    def test_validate_weird_urls(self):
        """Java-style and non-HTTP URLs yield warnings; GROUPID yields a notice."""
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.mets.unique_identifier = 'foobar'
            ws.mets.add_file('OCR-D-GT-PAGE', ID='file1', mimetype='image/png',
                             pageId='page1', url='file:/java-file-url')
            f = ws.mets.add_file('OCR-D-GT-PAGE', ID='file2', mimetype='image/png',
                                 pageId='page2', url='nothttp://unusual.scheme')
            f._el.set('GROUPID', 'donotuse')  # pylint: disable=protected-access
            ws.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'),
                                                 skip=['pixel_density'])
            self.assertEqual(len(report.errors), 0)
            self.assertEqual(len(report.warnings), 2)
            self.assertIn("Java-specific", report.warnings[0])
            self.assertIn("non-HTTP", report.warnings[1])
            self.assertEqual(len(report.notices), 1)
            self.assertIn("has GROUPID attribute", report.notices[0])

    def test_validate_pixel_no_download(self):
        """A remote image is not downloaded; a notice is produced instead."""
        imgpath = assets.path_to(
            'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-BIN/BIN_0020')
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.mets.unique_identifier = 'foobar'
            ws.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png',
                             pageId='page1', url='file://%s' % imgpath)
            ws.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'),
                                                 skip=[])
            self.assertEqual(len(report.errors), 0)
            self.assertEqual(len(report.warnings), 0)
            self.assertEqual(len(report.notices), 1)
            self.assertIn("Won't download remote image", report.notices[0])

    def test_validate_pixel_density_too_low(self):
        """Low x/y resolution yields two errors."""
        imgpath = assets.path_to(
            'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-BIN/BIN_0017')
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.mets.unique_identifier = 'foobar'
            ws.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png',
                             pageId='page1', url='file://%s' % imgpath)
            ws.save_mets()
            report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'),
                                                 skip=[], download=True)
            self.assertEqual(len(report.errors), 2)
            self.assertIn("xResolution", report.errors[0])
            self.assertIn("yResolution", report.errors[1])
            self.assertEqual(len(report.warnings), 0)
            self.assertEqual(len(report.notices), 0)

    def test_bad_workspace(self):
        """A non-existent workspace path fails to instantiate."""
        report = WorkspaceValidator.validate(self.resolver, 'non existe')
        self.assertFalse(report.is_valid)
        self.assertIn('Failed to instantiate workspace:', report.errors[0])

    def test_skip_page(self):
        """Skipping every check produces a valid report."""
        report = WorkspaceValidator.validate(
            self.resolver,
            None,
            src_dir=assets.path_to('kant_aufklaerung_1784/data'),
            download=True,
            skip=[
                'page', 'mets_unique_identifier', 'mets_file_group_names',
                'mets_files', 'pixel_density',
            ])
        self.assertTrue(report.is_valid)

    def test_src_dir(self):
        """Full validation of the fixture yields exactly 42 errors."""
        report = WorkspaceValidator.validate(
            self.resolver,
            None,
            src_dir=assets.path_to('kant_aufklaerung_1784/data'),
            download=True,
        )
        self.assertEqual(len(report.errors), 42)