def test_get_mime_types(self): doc = Document.load(self.path) self.assertEqual( { 'application/vnd.prima.page+xml', 'image/tiff', 'application/alto+xml' }, doc.mime_types)
def test_path_with_spaces(self): doc = Document.load( (TEST_BASE_PATH / 'example/workspaces/heavy quoting/mets.xml').as_uri()) page = doc.page_for_id('PHYS_0017', 'OCR-D-GT-PAGE') image = doc.workspace.image_from_page(page.page, 'PHYS_0017') # Assert no exceptions happened and a sensible return value self.assertGreater(image[0].height, 100)
def test_missing_image(self): path = TEST_BASE_PATH / 'example/workspaces/kant_aufklaerung_1784_missing_image/mets.xml' uri = path.as_uri() doc = Document.load(uri) page = doc.page_for_id('PHYS_0017', 'OCR-D-GT-PAGE') image, info, exif = page.get_image(feature_selector='', feature_filter='binarized') # Assert no exceptions happened and no image returned self.assertIsNone(image)
def test_path_ocrd_file(self): doc = Document.load(self.path) image_file = list( doc.workspace.mets.find_files(pageId='PHYS_0017', fileGrp='OCR-D-IMG'))[0] self.assertEqual( ASSETS_PATH / 'kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif', doc.path(image_file))
def _open(self, uri: str) -> None: # noinspection PyTypeChecker self.document = Document.load(uri, emitter=self.emit) self.page_list.set_document(self.document) self.view_manager.set_document(self.document) self.update_ui() if len(self.document.page_ids): self.on_page_activated(None, self.document.page_ids[0])
def test_page_for_id_with_multiple_images_for_page_and_fileGrp(self): """ returns first image and warns """ doc = Document.load(ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml') # with self.assertLogs('ocrd_browser.model.document', level='WARNING') as log_watch: page = doc.page_for_id('PHYS_0017', 'OCR-D-IMG-CLIP') self.assertIsInstance(page, Page) self.assertIsInstance(page.pc_gts, PcGtsType)
def test_can_call_get_image_if_supported(self): page: LazyPage = Document.load(ASSETS_PATH / 'kant_aufklaerung_1784-binarized/data/mets.xml').page_for_id('P_0017', 'OCR-D-GT-WORD') if IMAGE_FROM_PAGE_FILENAME_SUPPORT: image_by_feature, _, _ = page.get_image(feature_selector={'binarized'}, feature_filter={'cropped'}) image_by_filename, _, _ = page.get_image(filename='OCR-D-IMG-BIN/BIN_0017.png', feature_filter={'cropped'}) self.assertEqual(image_by_feature, image_by_filename) else: try: page.get_image(filename='OCR-D-GT-IMG-BIN/PAGE_2019.tif') self.fail('IMAGE_FROM_PAGE_FILENAME_SUPPORT detected wrong') except RuntimeError as e: self.assertTrue(str(e).startswith('Parameter filename not supported in '))
def test_get_page_index(self): doc = Document.load(self.path) file_index = doc.get_file_index() page17 = [ file for file in file_index.values() if file.static_page_id == 'PHYS_0017' ] alto = [ file for file in file_index.values() if file.mimetype == 'application/alto+xml' ] self.assertEqual(3, len(page17)) self.assertEqual(2, len(alto))
def test_page_for_id_with_multiple_images_for_page_and_fileGrp(self): """ returns first image and warns """ doc = Document.load(ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml') with self.assertLogs('ocrd_browser.model.document', level='WARNING') as log_watch: page = doc.page_for_id('PHYS_0017', 'OCR-D-IMG-CLIP') self.assertIsInstance(page, Page) self.assertEqual(1, len(log_watch.records)) self.assertEqual( "No PAGE-XML but 2 images for page 'PHYS_0017' in fileGrp 'OCR-D-IMG-CLIP'", log_watch.records[0].msg)
def test_page_for_id_with_no_images_for_page_and_fileGrp(self): """ Issue #4: list index out of range on non-XML fileGrp https://github.com/hnesk/browse-ocrd/issues/4 """ doc = Document.load(ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml') with self.assertLogs('ocrd_browser.model.document', level='WARNING') as log_watch: page = doc.page_for_id('PHYS_0020', 'OCR-D-IMG-CLIP') self.assertIsNone(page) self.assertEqual(1, len(log_watch.records)) self.assertEqual( "No PAGE-XML and no image for page 'PHYS_0020' in fileGrp 'OCR-D-IMG-CLIP'", log_watch.records[0].msg)
def test_page_for_id_with_nothing_for_page_and_fileGrp(self): """ Issue #4 again: This time for missing PAGE-XMLs https://github.com/hnesk/browse-ocrd/issues/4 """ doc = Document.load( ASSETS_PATH / '../example/workspaces/kant_aufklaerung_1784_missing_xml/mets.xml') with self.assertLogs('ocrd_browser.model.document', level='WARNING') as log_watch: page = doc.page_for_id('PHYS_0020', 'OCR-D-GT-PAGE') self.assertIsNone(page) self.assertEqual(1, len(log_watch.records)) self.assertEqual( "No PAGE-XML and no image for page 'PHYS_0020' in fileGrp 'OCR-D-GT-PAGE'", log_watch.records[0].msg)
def test_save(self): doc = Document.clone(self.path) with TemporaryDirectory(prefix='browse-ocrd-tests') as directory: saved_mets = directory + '/mets.xml' doc.save_as(saved_mets) saved = Document.load(saved_mets) self.assertEqual(doc.file_groups, saved.file_groups) self.assertEqual(doc.page_ids, saved.page_ids) self.assertEqual(doc.workspace.mets.unique_identifier, saved.workspace.mets.unique_identifier) for page_id in doc.page_ids: for file_group, mime in doc.file_groups_and_mimetypes: original_file = doc.files_for_page_id( page_id, file_group, mime)[0] saved_file = saved.files_for_page_id( page_id, file_group, mime)[0] self.assertEqual(original_file, saved_file)
def test_modify_when_not_editable(self): doc = Document.load(self.path) with self.assertRaises(PermissionError): doc.reorder(['PHYS_0020', 'PHYS_0017'])
def test_path_path(self): doc = Document.load(self.path) self.assertEqual( ASSETS_PATH / 'kant_aufklaerung_1784/data/OCR-D-DIR/lala.xml', doc.path(Path('OCR-D-DIR/lala.xml')))
def test_path_string(self): doc = Document.load(self.path) self.assertEqual(ASSETS_PATH / 'kant_aufklaerung_1784/data/lala.xml', doc.path('lala.xml'))
def test_get_default_image_group_with_missing_ocr_d_img(self): doc = Document.load( ASSETS_PATH / '../example/workspaces/no_ocrd_d_img_group/mets.xml') file_group = doc.get_default_image_group() self.assertEqual('OCR-D-IMG-PNG', file_group)
def test_get_default_image_group_no_preference(self): doc = Document.load(ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml') file_group = doc.get_default_image_group() self.assertEqual('OCR-D-IMG', file_group)
def test_get_default_image_group(self): doc = Document.load(ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml') file_group = doc.get_default_image_group( ['OCR-D-IMG-BIN', 'OCR-D-IMG.*']) self.assertEqual('OCR-D-IMG-BIN', file_group)
def test_get_image_paths(self): doc = Document.load(self.path) image_paths = doc.get_image_paths('OCR-D-IMG') self.assertEqual(2, len(image_paths)) self.assertEqual('INPUT_0017.tif', image_paths['PHYS_0017'].name) self.assertEqual('INPUT_0020.tif', image_paths['PHYS_0020'].name)
def test_get_file_groups_and_mimetypes(self): doc = Document.load(self.path) expected = [('OCR-D-IMG', 'image/tiff'), ('OCR-D-GT-PAGE', 'application/vnd.prima.page+xml'), ('OCR-D-GT-ALTO', 'application/alto+xml')] self.assertEqual(expected, doc.file_groups_and_mimetypes)
def test_xpath_works_with_different_namespaces(self): doc = Document.load(TEST_BASE_PATH / 'example/workspaces/aletheiaexamplepage/mets.xml') for page_id in ['PAGE_2017', 'PAGE_2018', 'PAGE_2019']: page = doc.page_for_id(page_id, 'OCR-D-GT-PAGE') xpath_result = page.xpath('/page:PcGts/page:Page/@imageFilename') self.assertGreater(len(xpath_result), 0)
def setUp(self) -> None: self.launcher = Launcher() self.doc = Document.load(BASE_PATH / 'mets.xml') self.file = self.doc.files_for_page_id('PHYS_0017', 'OCR-D-GT-PAGE')[0]
def test_get_page_ids(self): doc = Document.load(self.path) self.assertEqual(['PHYS_0017', 'PHYS_0020'], doc.page_ids)