Exemplo n.º 1
0
def _fixture_plain_workspace(tmp_path):
    """Yield an empty workspace created in ``tmp_path``.

    The workspace is created via ``Resolver.workspace_from_nothing`` and the
    process working directory is switched to ``tmp_path`` for as long as the
    consumer holds the yielded workspace.

    Args:
        tmp_path: directory in which the workspace is created and which
            becomes the current working directory.

    Yields:
        The workspace returned by ``workspace_from_nothing``.
    """
    resolver = Resolver()
    ws = resolver.workspace_from_nothing(directory=tmp_path)
    prev_dir = abspath(curdir)
    chdir(tmp_path)
    try:
        yield ws
    finally:
        # Restore the previous cwd even when the generator is closed early or
        # an exception is thrown into it at the yield point; otherwise every
        # later consumer would keep running inside ``tmp_path``.
        chdir(prev_dir)
Exemplo n.º 2
0
class TestWorkspace(TestCase):
    """Tests for adding, downloading and removing files through a workspace."""

    def setUp(self):
        # Each test starts from its own Resolver instance.
        self.resolver = Resolver()

    def test_workspace_add_file(self):
        """A file added with content is listed in the METS and exists on disk."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            target = join(tempdir, 'ID1.tif')
            workspace.add_file(
                'GRP',
                ID='ID1',
                mimetype='image/tiff',
                content='CONTENT',
                local_filename=target,
            )
            added = workspace.mets.find_files()[0]
            self.assertEqual(added.ID, 'ID1')
            self.assertEqual(added.mimetype, 'image/tiff')
            self.assertEqual(added.url, target)
            self.assertEqual(added.local_filename, target)
            self.assertTrue(exists(target))

    def test_workspace_add_file_basename_no_content(self):
        """A file added without content or filename has no URL."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.add_file('GRP', ID='ID1', mimetype='image/tiff')
            added = workspace.mets.find_files()[0]
            self.assertEqual(added.url, None)

    def test_workspace_add_file_binary_content(self):
        """Binary content is written to ``local_filename``, here in a subdir."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            target = join(tempdir, 'subdir', 'ID1.tif')
            workspace.add_file(
                'GRP',
                ID='ID1',
                content=b'CONTENT',
                local_filename=target,
                url='http://foo/bar',
            )
            self.assertTrue(exists(target))

    def test_workspacec_add_file_content_wo_local_filename(self):
        """Passing ``content`` without ``local_filename`` must be rejected."""
        expected_msg = "'content' was set but no 'local_filename'"
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            with self.assertRaisesRegex(Exception, expected_msg):
                workspace.add_file('GRP', ID='ID1', content=b'CONTENT')

    def test_workspace_str(self):
        """``str()`` of an empty, saved and reloaded workspace is stable."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.save_mets()
            workspace.reload_mets()
            expected = (
                'Workspace[directory=%s, baseurl=None, file_groups=[], files=[]]'
                % tempdir)
            self.assertEqual(str(workspace), expected)

    def test_workspace_backup(self):
        """Saving with ``automatic_backup`` enabled leaves the workspace intact."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.automatic_backup = True
            workspace.save_mets()
            workspace.reload_mets()
            expected = (
                'Workspace[directory=%s, baseurl=None, file_groups=[], files=[]]'
                % tempdir)
            self.assertEqual(str(workspace), expected)

    def test_download_url0(self):
        """Downloading a local absolute path yields a path below ``TEMP``."""
        with TemporaryDirectory() as directory:
            workspace = self.resolver.workspace_from_nothing(directory)
            downloaded = workspace.download_url(abspath(__file__))
            self.assertEqual(downloaded, join('TEMP', basename(__file__)))

    def test_download_url_without_baseurl(self):
        """Without a source baseurl the sample file cannot be resolved."""
        with TemporaryDirectory() as tempdir:
            mets_copy = join(tempdir, 'mets.xml')
            copyfile(SRC_METS, mets_copy)
            workspace = self.resolver.workspace_from_url(mets_copy)
            expected_msg = "Already tried prepending baseurl '%s'" % tempdir
            with self.assertRaisesRegex(Exception, expected_msg):
                workspace.download_url(SAMPLE_FILE_URL)

    def test_download_url_with_baseurl(self):
        """With a source baseurl the sample file is fetched below ``TEMP``."""
        with TemporaryDirectory() as tempdir:
            mets_copy = join(tempdir, 'mets.xml')
            copyfile(SRC_METS, mets_copy)
            workspace = self.resolver.workspace_from_url(
                mets_copy, src_baseurl=dirname(SRC_METS))
            downloaded = Path(workspace.download_url(SAMPLE_FILE_URL))
            self.assertEqual(downloaded,
                             Path('TEMP', '%s.tif' % SAMPLE_FILE_ID))
            self.assertTrue(Path(workspace.directory, downloaded).exists())

    def test_from_url_dst_dir_download(self):
        """
        https://github.com/OCR-D/core/issues/319
        """
        with TemporaryDirectory() as tempdir:
            ws_dir = join(tempdir, 'non-existing-for-good-measure')
            # A path relative to the cwd is what triggers #319.
            mets_abs = Path(
                assets.path_to('kant_aufklaerung_1784/data/mets.xml'))
            src_path = str(mets_abs.relative_to(Path.cwd()))
            self.resolver.workspace_from_url(
                src_path, dst_dir=ws_dir, download=True)
            # Sanity check: mets.xml must exist.
            self.assertTrue(Path(ws_dir, 'mets.xml').exists())
            self.assertTrue(
                Path(ws_dir, 'OCR-D-GT-PAGE/PAGE_0017_PAGE.xml').exists())

    def test_superfluous_copies_in_ws_dir(self):
        """
        https://github.com/OCR-D/core/issues/227
        """
        def find_recursive(root):
            # Flat list of every file name found anywhere below ``root``.
            return [name for _, _, names in walk(root) for name in names]

        src_mets = assets.path_to('SBB0000F29300010000/data/mets_one_file.xml')
        with TemporaryDirectory() as wsdir:
            with open(src_mets, 'r') as f_in, \
                    open(join(wsdir, 'mets.xml'), 'w') as f_out:
                f_out.write(f_in.read())
            # Only the METS itself is present before downloading.
            self.assertEqual(len(find_recursive(wsdir)), 1)
            workspace = Workspace(self.resolver, wsdir)
            for mets_file in workspace.mets.find_files():
                workspace.download_file(mets_file)
            # Exactly one additional file may show up after downloading.
            self.assertEqual(len(find_recursive(wsdir)), 2)
            self.assertTrue(
                exists(join(wsdir, 'OCR-D-IMG/FILE_0005_IMAGE.tif')))

    def test_remove_file_force(self):
        """Removing an unknown file ID fails unless ``force`` is given."""
        src_dir = assets.path_to('SBB0000F29300010000/data')
        with copy_of_directory(src_dir) as tempdir:
            workspace = Workspace(self.resolver, directory=tempdir)
            # Without force: must fail.
            with self.assertRaisesRegex(FileNotFoundError, "not found"):
                workspace.remove_file('non-existing-id')
            # With force: must succeed.
            workspace.remove_file('non-existing-id', force=True)

    def test_remove_file_remote(self):
        """Removing a file that only has a remote URL needs ``force``."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.add_file(
                'IMG',
                ID='page1_img',
                mimetype='image/tiff',
                url='http://remote',
            )
            # Without force: must fail.
            with self.assertRaisesRegex(Exception, "not locally available"):
                workspace.remove_file('page1_img')
            # With force: must succeed.
            workspace.remove_file('page1_img', force=True)

    def test_remove_file_group_force(self):
        """Removing an unknown fileGrp fails unless ``force`` is given."""
        src_dir = assets.path_to('SBB0000F29300010000/data')
        with copy_of_directory(src_dir) as tempdir:
            workspace = Workspace(self.resolver, directory=tempdir)
            # Error expected unless forced.
            with self.assertRaisesRegex(Exception, "No such fileGrp"):
                workspace.remove_file_group('I DO NOT EXIST')
            # No error when forced.
            workspace.remove_file_group('I DO NOT EXIST', force=True)

    def test_remove_file_group_rmdir(self):
        """Recursive fileGrp removal also removes the fileGrp directory."""
        src_dir = assets.path_to('SBB0000F29300010000/data')
        with copy_of_directory(src_dir) as tempdir:
            workspace = Workspace(self.resolver, directory=tempdir)
            group_dir = join(tempdir, 'OCR-D-IMG')
            self.assertTrue(exists(group_dir))
            workspace.remove_file_group('OCR-D-IMG', recursive=True)
            self.assertFalse(exists(join(tempdir, 'OCR-D-IMG')))

    def test_download_to_directory_from_workspace_download_file(self):
        """
        https://github.com/OCR-D/core/issues/342
        """
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)

            img_file = workspace.add_file(
                'IMG',
                ID='page1_img',
                mimetype='image/tiff',
                local_filename='test.tif',
                content='',
            )
            gt_file = workspace.add_file(
                'GT',
                ID='page1_gt',
                mimetype='text/xml',
                local_filename='test.xml',
                content='',
            )

            self.assertEqual(img_file.url, 'test.tif')
            self.assertEqual(gt_file.url, 'test.xml')

            # Downloading files that are already local should be a no-op ...
            workspace.download_file(img_file)
            workspace.download_file(gt_file)

            # ... so the URLs must be unchanged afterwards.
            self.assertEqual(img_file.url, 'test.tif')
            self.assertEqual(gt_file.url, 'test.xml')

    def test_save_image_file(self):
        """Images are saved per mimetype; an unknown mimetype raises KeyError."""
        from PIL import Image
        blank = Image.new('RGB', (1000, 1000))
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            # The empty pattern matches any message: only the type is checked.
            with self.assertRaisesRegex(KeyError, ''):
                workspace.save_image_file(
                    blank, 'page1_img', 'IMG', 'page1',
                    'ceci/nest/pas/une/mimetype')
            workspace.save_image_file(
                blank, 'page1_img', 'IMG', 'page1', 'image/jpeg')
            self.assertTrue(exists(join(tempdir, 'IMG', 'page1_img.jpg')))
Exemplo n.º 3
0
def convert(cocofile, directory):
    """Convert MS-COCO JSON to METS/PAGE XML files.

    Load JSON ``cocofile`` (in MS-COCO format)
    and chdir to ``directory`` (which it refers to).

    Start a METS file mets.xml with references to
    the image files (under fileGrp ``OCR-D-IMG``)
    and their corresponding PAGE-XML annotations
    (under fileGrp ``OCR-D-GT-SEG-BLOCK``), as
    parsed from ``cocofile`` and written using
    the same basename.

    Images without any annotation are converted as well; their PAGE
    file simply contains no regions.

    Args:
        cocofile: open file object holding the MS-COCO JSON; it is
            consumed with ``json.load``.
        directory: path (string) of the directory to work in; it is also
            appended to the METS unique identifier.

    Raises:
        KeyError: if an image file extension is missing from
            ``EXT_TO_MIME`` or an annotation refers to an unknown image
            or category ID.
        AssertionError: if the image size found by ``page_from_image``
            differs from the size recorded in the JSON.
        Exception: if an annotation has a category name other than
            ``text``, ``title``, ``list``, ``table`` or ``figure``.
    """
    # Categories that become a TextRegion, with the text type to assign.
    text_types = {
        'text': TextTypeSimpleType.PARAGRAPH,
        'title': TextTypeSimpleType.HEADING,  # CAPTION?
        'list': TextTypeSimpleType.LISTLABEL,  # OTHER?
    }
    resolver = Resolver()
    with pushd_popd(directory):
        workspace = resolver.workspace_from_nothing('.')
        # https://github.com/ibm-aur-nlp/PubLayNet
        workspace.mets.unique_identifier = 'ocrd_PubLayNet_' + directory
        coco = json.load(cocofile)
        LOG.info('Loaded JSON for %d images with %d regions in %d categories',
                 len(coco['images']), len(coco['annotations']),
                 len(coco['categories']))
        categories = {cat['id']: cat['name'] for cat in coco['categories']}
        images = {image['id']: image for image in coco['images']}
        # Attach every annotation to the image record it belongs to.
        for annotation in coco['annotations']:
            image = images[annotation['image_id']]
            image.setdefault('regions', []).append(annotation)
        # Drop the top-level reference; the per-image records stay reachable
        # through ``images``.
        del coco
        LOG.info('Parsing annotations into PAGE-XML')
        for image in images.values():
            page_id = 'p' + str(image['id'])
            file_base, file_ext = os.path.splitext(image['file_name'])
            filename = file_base + '.xml'
            image_file = workspace.add_file('OCR-D-IMG',
                                            ID='OCR-D-IMG_' + page_id,
                                            pageId=page_id,
                                            mimetype=EXT_TO_MIME[file_ext],
                                            local_filename=image['file_name'])
            LOG.info('Added page %s file %s of type %s', image_file.pageId,
                     image_file.local_filename, image_file.mimetype)
            pcgts = page_from_image(image_file)
            pcgts.set_pcGtsId(page_id)
            page = pcgts.get_Page()
            # Sanity check: the JSON must describe the image actually on disk.
            assert page.imageWidth == image['width']
            assert page.imageHeight == image['height']
            # 'regions' only exists on images that received at least one
            # annotation above, so fall back to an empty list instead of
            # failing with KeyError.
            regions = image.get('regions', [])
            for region in regions:
                # NOTE(review): the reshape assumes 'segmentation' holds a
                # single flat [x1, y1, x2, y2, ...] list wrapped in an outer
                # list -- confirm for inputs with multi-part polygons.
                polygon = np.array(region['segmentation'])
                polygon = np.reshape(polygon, (polygon.shape[1] // 2, 2))
                coords = CoordsType(points=points_from_polygon(polygon))
                category = categories[region['category_id']]
                region_id = 'r' + str(region['id'])
                if category in text_types:
                    page.add_TextRegion(
                        TextRegionType(id=region_id,
                                       Coords=coords,
                                       type_=text_types[category]))
                elif category == 'table':
                    page.add_TableRegion(
                        TableRegionType(id=region_id, Coords=coords))
                elif category == 'figure':
                    page.add_ImageRegion(
                        ImageRegionType(id=region_id, Coords=coords))
                else:
                    raise Exception('unknown image category: %s' % category)
            page_file = workspace.add_file('OCR-D-GT-SEG-BLOCK',
                                           ID='OCR-D-GT-SEG-BLOCK_' + page_id,
                                           pageId=page_id,
                                           mimetype=MIMETYPE_PAGE,
                                           local_filename=filename,
                                           content=to_xml(pcgts))
            LOG.info('Added page %s file %s with %d regions', page_file.pageId,
                     page_file.local_filename, len(regions))
        LOG.info('All done')
        workspace.save_mets()
Exemplo n.º 4
0
class TestResolver(TestCase):
    """Tests for ``Resolver``: workspace creation, downloads, image access."""

    def setUp(self):
        self.resolver = Resolver()
        self.folder = join(TMP_FOLDER, 'kant_aufklaerung_1784')
        # Start from a clean scratch folder holding a copy of the Kant data.
        if exists(TMP_FOLDER):
            rmtree(TMP_FOLDER)
            os.makedirs(TMP_FOLDER)
        copytree(FOLDER_KANT, self.folder)

    def test_workspace_from_url_bad(self):
        with self.assertRaisesRegex(Exception,
                                    "Must pass mets_url and/or baseurl"):
            self.resolver.workspace_from_url(None)

    def test_workspace_from_url_tempdir(self):
        self.resolver.workspace_from_url(
            mets_basename='foo.xml',
            mets_url=
            'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml'
        )

    def test_workspace_from_url_download(self):
        with TemporaryDirectory() as dst_dir:
            self.resolver.workspace_from_url(
                mets_basename='foo.xml',
                dst_dir=dst_dir,
                download=True,
                mets_url=
                'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml'
            )

    def test_workspace_from_url_no_clobber(self):
        with self.assertRaisesRegex(
                Exception, "already exists but clobber_mets is false"):
            with TemporaryDirectory() as dst_dir:
                # An existing mets.xml must not be overwritten.
                with open(join(dst_dir, 'mets.xml'), 'w') as f:
                    f.write('CONTENT')
                self.resolver.workspace_from_url(
                    dst_dir=dst_dir,
                    mets_url=
                    'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml'
                )

    def test_workspace_from_url_404(self):
        with self.assertRaisesRegex(Exception, "Not found"):
            self.resolver.workspace_from_url(
                mets_url=
                'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xmlX'
            )

    def test_workspace_from_url_rel_dir(self):
        with TemporaryDirectory() as dst_dir:
            # Remember the cwd locally and restore it in ``finally`` so a
            # failure in here cannot leave later tests running in FOLDER_KANT.
            prev_cwd = os.getcwd()
            os.chdir(FOLDER_KANT)
            try:
                self.resolver.workspace_from_url(
                    None,
                    baseurl='data',
                    dst_dir='../../../../../../../../../../../../../../../../' +
                    dst_dir[1:])
            finally:
                os.chdir(prev_cwd)

    def test_workspace_from_url(self):
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        image_file = input_files[0]
        f = workspace.download_file(image_file)
        self.assertEqual(f.ID, 'FILE_0001_IMAGE')

    def test_resolve_image(self):
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        f = input_files[0]
        print(f.url)
        img_pil1 = workspace.resolve_image_as_pil(f.url)
        self.assertEqual(img_pil1.size, (2875, 3749))
        img_pil2 = workspace.resolve_image_as_pil(f.url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil2.size, (1, 1))
        # NOTE(review): the same crop is requested a second time without any
        # assertion -- presumably to exercise a repeated/cached lookup; confirm.
        img_pil2 = workspace.resolve_image_as_pil(f.url, [[0, 0], [1, 1]])

    def test_resolve_image_grayscale(self):
        img_url = assets.url_of(
            'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017'
        )
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        img_pil1 = workspace.resolve_image_as_pil(img_url)
        self.assertEqual(img_pil1.size, (1457, 2083))
        img_pil2 = workspace.resolve_image_as_pil(img_url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil2.size, (1, 1))

    def test_resolve_image_bitonal(self):
        img_url = assets.url_of(
            'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-1BIT/OCR-D-IMG-1BIT_0017'
        )
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        img_pil1 = workspace.resolve_image_as_pil(img_url)
        self.assertEqual(img_pil1.size, (1457, 2083))
        img_pil2 = workspace.resolve_image_as_pil(img_url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil2.size, (1, 1))

    def test_workspace_from_nothing(self):
        ws1 = self.resolver.workspace_from_nothing(None)
        self.assertIsNotNone(ws1.mets)
        tmp_dir = join(TMP_FOLDER, 'from-nothing')
        ws2 = self.resolver.workspace_from_nothing(tmp_dir)
        self.assertEqual(ws2.directory, tmp_dir)
        # Creating a second workspace in the same directory must be refused.
        # assertRaisesRegex reports a missing exception directly instead of
        # having an AssertionError caught by a broad ``except Exception``.
        with self.assertRaisesRegex(Exception, 'Not clobbering'):
            self.resolver.workspace_from_nothing(tmp_dir)

    def test_download_to_directory_badargs_url(self):
        with self.assertRaisesRegex(Exception, "'url' must be a string"):
            self.resolver.download_to_directory(None, None)

    def test_download_to_directory_badargs_directory(self):
        with self.assertRaisesRegex(Exception, "'directory' must be a string"):
            self.resolver.download_to_directory(None, 'foo')

    def test_download_to_directory_default(self):
        tmp_dir = join(TMP_FOLDER, 'target')
        fn = self.resolver.download_to_directory(
            tmp_dir, 'file://' + join(self.folder, 'data/mets.xml'))
        # The expected name is built from the source path with '/', '_', '.'
        # and '-' each replaced by '.'.
        self.assertEqual(
            fn,
            join(tmp_dir,
                 'file%s.data.mets.xml' % sub(r'[/_\.\-]', '.', self.folder)))

    def test_download_to_directory_basename(self):
        tmp_dir = join(TMP_FOLDER, 'target')
        fn = self.resolver.download_to_directory(
            tmp_dir,
            'file://' + join(self.folder, 'data/mets.xml'),
            basename='foo')
        self.assertEqual(fn, join(tmp_dir, 'foo'))

    def test_download_to_directory_subdir(self):
        tmp_dir = join(TMP_FOLDER, 'target')
        fn = self.resolver.download_to_directory(
            tmp_dir,
            'file://' + join(self.folder, 'data/mets.xml'),
            subdir='baz')
        self.assertEqual(fn, join(tmp_dir, 'baz', 'mets.xml'))

    def test_workspace_add_file(self):
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            fpath = join(tempdir, 'ID1.tif')
            ws1.add_file('GRP',
                         ID='ID1',
                         mimetype='image/tiff',
                         content='CONTENT',
                         local_filename=fpath)
            f = ws1.mets.find_files()[0]
            self.assertEqual(f.ID, 'ID1')
            self.assertEqual(f.mimetype, 'image/tiff')
            self.assertEqual(f.url, fpath)
            self.assertEqual(f.local_filename, fpath)
            self.assertTrue(exists(fpath))

    def test_workspace_add_file_basename_no_content(self):
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.add_file('GRP', ID='ID1', mimetype='image/tiff')
            f = ws1.mets.find_files()[0]
            self.assertEqual(f.url, '')

    def test_workspace_add_file_binary_content(self):
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            fpath = join(tempdir, 'subdir', 'ID1.tif')
            ws1.add_file('GRP',
                         ID='ID1',
                         content=b'CONTENT',
                         local_filename=fpath,
                         url='http://foo/bar')
            self.assertTrue(exists(fpath))

    def test_workspacec_add_file_content_wo_local_filename(self):
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            with self.assertRaisesRegex(
                    Exception, "'content' was set but no 'local_filename'"):
                ws1.add_file('GRP', ID='ID1', content=b'CONTENT')

    def test_workspace_str(self):
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(
                str(ws1),
                'Workspace[directory=%s, file_groups=[], files=[]]' % tempdir)

    def test_workspace_backup(self):
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.automatic_backup = True
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(
                str(ws1),
                'Workspace[directory=%s, file_groups=[], files=[]]' % tempdir)

    def test_227_1(self):
        def find_recursive(root):
            # Flat list of every file name found anywhere below ``root``.
            ret = []
            for _, _, f in os.walk(root):
                for file in f:
                    ret.append(file)
            return ret

        with TemporaryDirectory() as wsdir:
            with open(
                    assets.path_to(
                        'SBB0000F29300010000/data/mets_one_file.xml'),
                    'r') as f_in:
                with open(join(wsdir, 'mets.xml'), 'w') as f_out:
                    f_out.write(f_in.read())
            # Only the METS itself before downloading, one more file after.
            self.assertEqual(len(find_recursive(wsdir)), 1)
            ws1 = Workspace(self.resolver, wsdir)
            for file in ws1.mets.find_files():
                ws1.download_file(file)
            self.assertEqual(len(find_recursive(wsdir)), 2)
            self.assertTrue(exists(join(wsdir, 'OCR-D-IMG/FILE_0005_IMAGE')))
Exemplo n.º 5
0
class TestCli(TestCase):
    """Tests for the ``workspace_cli`` command group, run via ``CliRunner``."""

    def setUp(self):
        self.maxDiff = None
        self.resolver = Resolver()
        initLogging()
        self.runner = CliRunner()

    def test_add(self):
        """
        Ensure that `ocrd workspace add` does the right thing
        """
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        local_filename = join(file_grp, 'foo.xml')

        # Add the file through the API ...
        with TemporaryDirectory() as tempdir:
            ws_api = self.resolver.workspace_from_nothing(directory=tempdir)
            ws_api.add_file(file_grp,
                            ID=ID,
                            content=content,
                            pageId=page_id,
                            mimetype=mimetype,
                            local_filename=local_filename)
            ws_api.save_mets()

        # ... and through the CLI.
        with TemporaryDirectory() as tempdir:
            # NOTE(review): presumably this sets up the workspace the CLI
            # call below operates on -- confirm.
            self.resolver.workspace_from_nothing(directory=tempdir)
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)
            # Invoke the CLI only after the file has been closed; while the
            # handle is still open the written content may sit in the buffer
            # and the CLI would see an empty file.
            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir, 'add', '--file-grp', file_grp, '--page-id',
                page_id, '--file-id', ID, '--mimetype', mimetype,
                content_file
            ])
            # TODO: comparing the METS produced by the API with the one
            # produced by the CLI is too complex; only the exit code is
            # checked for now.
            self.assertEqual(result.exit_code, 0)

    def test_add_remove(self):
        """Removing a file without ``--force`` keeps the file on disk."""
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)

            result = self.runner.invoke(workspace_cli, ['init', tempdir])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir, 'add', '--file-grp', file_grp, '--page-id',
                page_id, '--file-id', ID, '--mimetype', mimetype, content_file
            ])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(workspace_cli,
                                        ['-d', tempdir, 'remove', ID])
            self.assertEqual(result.exit_code, 0)

            # File should still exist
            self.assertTrue(exists(content_file))

    def test_add_remove_force(self):
        """Removing a file with ``--force`` also deletes it from disk."""
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)

            result = self.runner.invoke(workspace_cli, ['init', tempdir])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir, 'add', '--file-grp', file_grp, '--page-id',
                page_id, '--file-id', ID, '--mimetype', mimetype, content_file
            ])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(
                workspace_cli, ['-d', tempdir, 'remove', '--force', ID])
            print(result)
            print(result.output)
            self.assertEqual(result.exit_code, 0)

            # File should have been deleted
            self.assertFalse(exists(content_file))

    def test_find_files(self):
        """``find -G ... -k fileGrp`` prints one line per matching file."""
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)
            with pushd_popd(wsdir):
                result = self.runner.invoke(
                    workspace_cli,
                    ['find', '-G', 'OCR-D-IMG-BIN', '-k', 'fileGrp'])
                self.assertEqual(result.output,
                                 'OCR-D-IMG-BIN\nOCR-D-IMG-BIN\n')
                self.assertEqual(result.exit_code, 0)

    def test_prune_files(self):
        """``prune-files`` reduces the sample METS from 35 to 7 files."""
        with TemporaryDirectory() as tempdir:
            copytree(assets.path_to('SBB0000F29300010000/data'),
                     join(tempdir, 'ws'))

            ws1 = self.resolver.workspace_from_url(
                join(tempdir, 'ws', 'mets.xml'))
            self.assertEqual(len(ws1.mets.find_files()), 35)

            result = self.runner.invoke(
                workspace_cli, ['-d', join(tempdir, 'ws'), 'prune-files'])
            self.assertEqual(result.exit_code, 0)

            # Re-read the METS from disk to see what the CLI wrote.
            ws2 = self.resolver.workspace_from_url(
                join(tempdir, 'ws', 'mets.xml'))
            self.assertEqual(len(ws2.mets.find_files()), 7)

    def test_remove_file_group(self):
        """
        Test removal of filegrp
        """
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)
            file_group = 'OCR-D-GT-PAGE'
            file_path = join(tempdir, 'ws', file_group,
                             'FILE_0002_FULLTEXT.xml')
            self.assertTrue(exists(file_path))

            workspace = self.resolver.workspace_from_url(
                join(wsdir, 'mets.xml'))
            self.assertEqual(workspace.directory, wsdir)

            # A non-empty group needs recursive=True; force alone is rejected.
            with self.assertRaisesRegex(Exception, "not empty"):
                workspace.remove_file_group(file_group)
            with self.assertRaisesRegex(Exception, "force without recursive"):
                workspace.remove_file_group(file_group, force=True)

            # The failed attempts must not have changed anything.
            self.assertTrue(exists(file_path))
            self.assertEqual(len(workspace.mets.file_groups), 17)
            self.assertEqual(len(workspace.mets.find_files()), 35)

            workspace.remove_file_group(file_group, recursive=True, force=True)

            self.assertEqual(len(workspace.mets.file_groups), 16)
            self.assertEqual(len(workspace.mets.find_files()), 33)
            self.assertFalse(exists(file_path))

    def test_copy_vs_clone(self):
        """Compare shallow and full ``clone`` results with a plain copy."""
        src_dir = assets.path_to('kant_aufklaerung_1784/data')
        with TemporaryDirectory() as tempdir:
            # cloned without download
            shallowcloneddir = join(tempdir, 'cloned-shallow')
            # cloned with download
            fullcloneddir = join(tempdir, 'cloned-all')
            # copied
            copieddir = join(tempdir, 'copied')

            Path(fullcloneddir).mkdir()
            Path(shallowcloneddir).mkdir()

            result = self.runner.invoke(
                workspace_cli,
                ['clone', join(src_dir, 'mets.xml'), shallowcloneddir])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(
                workspace_cli,
                ['clone', '-a',
                 join(src_dir, 'mets.xml'), fullcloneddir])
            self.assertEqual(result.exit_code, 0)

            with copy_of_directory(src_dir, copieddir):
                # The shallow clone lacks exactly the fileGrp directories.
                shallow_vs_copied = dircmp(shallowcloneddir, copieddir)
                self.assertEqual(
                    set(shallow_vs_copied.right_only),
                    {'OCR-D-GT-ALTO', 'OCR-D-GT-PAGE', 'OCR-D-IMG'})

                full_vs_copied = dircmp(fullcloneddir, copieddir)
                # XXX mets.xml will not have the exact same content because
                # URLs that are actually files will be marked up as such with
                # @LOCTYPE/@OTHERLOCTYPE, so diff_files is not compared.
                self.assertEqual(full_vs_copied.left_only, [])
                self.assertEqual(full_vs_copied.right_only, [])
Exemplo n.º 6
0
class TestResolver(TestCase):
    """Tests for ``Resolver``: workspace creation, image resolving, downloads."""

    def setUp(self):
        """Give every test its own ``Resolver`` instance."""
        self.resolver = Resolver()

    def test_workspace_from_url_bad(self):
        """``workspace_from_url(None)`` must complain about the missing mets_url."""
        with self.assertRaisesRegex(Exception, "Must pass 'mets_url'"):
            self.resolver.workspace_from_url(None)

    def test_workspace_from_url_tempdir(self):
        """A remote METS can be loaded without passing a ``dst_dir``."""
        self.resolver.workspace_from_url(
            mets_basename='foo.xml',
            mets_url='https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml')

    def test_workspace_from_url_download(self):
        """A remote METS can be loaded into ``dst_dir`` with ``download=True``."""
        with TemporaryDirectory() as dst_dir:
            self.resolver.workspace_from_url(
                'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml',
                mets_basename='foo.xml',
                dst_dir=dst_dir,
                download=True)

    def test_workspace_from_url_no_clobber(self):
        """With ``clobber_mets=False`` a pre-existing mets.xml does not raise."""
        with TemporaryDirectory() as dst_dir:
            src_mets = Path(assets.path_to('kant_aufklaerung_1784-binarized/data/mets.xml'))
            dst_mets = Path(dst_dir, 'mets.xml')
            # Seed the destination with a mets.xml before resolving.
            dst_mets.write_text(src_mets.read_text())
            self.resolver.workspace_from_url(
                'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml',
                clobber_mets=False,
                dst_dir=dst_dir)

    def test_workspace_from_url_404(self):
        """A METS URL that cannot be fetched raises "HTTP request failed"."""
        with self.assertRaisesRegex(Exception, "HTTP request failed"):
            self.resolver.workspace_from_url(mets_url='https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xmlX')

    def test_workspace_from_url_rel_dir(self):
        """A ``dst_dir`` given with many leading ``../`` resolves to the real directory."""
        with TemporaryDirectory() as dst_dir:
            # Strip the leading '/' and prefix enough '../' to climb to the root.
            bogus_dst_dir = '../../../../../../../../../../../../../../../../%s' % dst_dir[1:]
            with pushd_popd(FOLDER_KANT):
                ws1 = self.resolver.workspace_from_url('data/mets.xml', dst_dir=bogus_dst_dir)
                self.assertEqual(ws1.mets_target, pjoin(dst_dir, 'mets.xml'))
                self.assertEqual(ws1.directory, dst_dir)

    def test_workspace_from_url0(self):
        """Downloading the first OCR-D-IMG file yields the expected ID and local path."""
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        image_file = input_files[0]
        f = workspace.download_file(image_file)
        self.assertEqual('%s.tif' % f.ID, 'FILE_0001_IMAGE.tif')
        self.assertEqual(f.local_filename, 'OCR-D-IMG/FILE_0001_IMAGE.tif')

    # pylint: disable=protected-access
    def test_resolve_image0(self):
        """Resolve an image in full and as a 1x1 crop, the crop twice in a row."""
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        f = input_files[0]
        img_pil1 = workspace._resolve_image_as_pil(f.url)
        self.assertEqual(img_pil1.size, (2875, 3749))
        img_pil2 = workspace._resolve_image_as_pil(f.url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil2.size, (1, 1))
        # Same crop again; the repeated call must give the same size.
        # NOTE(review): presumably this exercises a cached code path - confirm.
        img_pil3 = workspace._resolve_image_as_pil(f.url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil3.size, (1, 1))

    # pylint: disable=protected-access
    def test_resolve_image_grayscale(self):
        """Resolve the OCR-D-IMG-NRM sample image in full and as a 1x1 crop."""
        img_url = assets.url_of('kant_aufklaerung_1784-binarized/data/OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017.png')
        workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))
        img_pil1 = workspace.resolve_image_as_pil(img_url)
        self.assertEqual(img_pil1.size, (1457, 2083))
        img_pil2 = workspace._resolve_image_as_pil(img_url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil2.size, (1, 1))

    # pylint: disable=protected-access
    def test_resolve_image_bitonal(self):
        """Resolve the OCR-D-IMG-1BIT sample image in full and as a 1x1 crop."""
        img_url = assets.url_of('kant_aufklaerung_1784-binarized/data/OCR-D-IMG-1BIT/OCR-D-IMG-1BIT_0017.png')
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        img_pil1 = workspace._resolve_image_as_pil(img_url)
        self.assertEqual(img_pil1.size, (1457, 2083))
        img_pil2 = workspace._resolve_image_as_pil(img_url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil2.size, (1, 1))

    def test_workspace_from_nothing(self):
        """``workspace_from_nothing(None)`` returns a workspace with a METS."""
        ws1 = self.resolver.workspace_from_nothing(None)
        self.assertIsNotNone(ws1.mets)

    def test_workspace_from_nothing_makedirs(self):
        """A not-yet-existing target directory is accepted and reported back."""
        with TemporaryDirectory() as tempdir:
            non_existent_dir = Path(tempdir, 'target')
            ws1 = self.resolver.workspace_from_nothing(non_existent_dir)
            self.assertEqual(ws1.directory, non_existent_dir)

    def test_workspace_from_nothing_noclobber(self):
        """Creating a second workspace in the same directory must be refused."""
        import re  # local import: only needed to escape the path in the pattern

        with TemporaryDirectory() as tempdir:
            ws2 = self.resolver.workspace_from_nothing(tempdir)
            self.assertEqual(ws2.directory, tempdir)
            # The temp path is data, not a pattern: escape it so that regex
            # metacharacters in the directory name cannot break the match.
            expected = "METS 'mets.xml' already exists in '%s' and clobber_mets not set" % re.escape(tempdir)
            with self.assertRaisesRegex(Exception, expected):
                # must fail because the first call already put a mets.xml there
                self.resolver.workspace_from_nothing(tempdir)

    def test_download_to_directory_badargs_url(self):
        """A ``None`` url is rejected."""
        with self.assertRaisesRegex(Exception, "'url' must be a string"):
            self.resolver.download_to_directory(None, None)

    def test_download_to_directory_badargs_directory(self):
        """A ``None`` directory is rejected."""
        with self.assertRaisesRegex(Exception, "'directory' must be a string"):
            self.resolver.download_to_directory(None, 'foo')

    def test_download_to_directory_default(self):
        """Without options the file keeps its basename in the target directory."""
        with copy_of_directory(FOLDER_KANT) as src:
            with TemporaryDirectory() as dst:
                fn = self.resolver.download_to_directory(dst, pjoin(src, 'data/mets.xml'))
                self.assertEqual(fn, 'mets.xml')
                self.assertTrue(Path(dst, fn).exists())

    def test_download_to_directory_basename(self):
        """``basename`` overrides the name of the downloaded file."""
        with copy_of_directory(FOLDER_KANT) as src:
            with TemporaryDirectory() as dst:
                fn = self.resolver.download_to_directory(dst, pjoin(src, 'data/mets.xml'), basename='foo')
                self.assertEqual(fn, 'foo')
                self.assertTrue(Path(dst, fn).exists())

    def test_download_to_directory_subdir(self):
        """``subdir`` places the downloaded file below that sub-directory."""
        with copy_of_directory(FOLDER_KANT) as src:
            with TemporaryDirectory() as dst:
                fn = self.resolver.download_to_directory(dst, pjoin(src, 'data/mets.xml'), subdir='baz')
                self.assertEqual(fn, pjoin('baz', 'mets.xml'))
                self.assertTrue(Path(dst, fn).exists())
class TestWorkspaceValidator(TestCase):
    """Tests for ``WorkspaceValidator`` reports (errors, warnings, notices)."""

    def setUp(self):
        """Give every test its own ``Resolver`` instance."""
        super().setUp()
        self.resolver = Resolver()

    def test_check_file_grp_basic(self):
        """``check_file_grp`` reports missing input and pre-existing output groups."""
        workspace = self.resolver.workspace_from_url(
            assets.url_of('SBB0000F29300010000/data/mets.xml'))

        # Unknown input group
        report = WorkspaceValidator.check_file_grp(workspace, 'foo', 'bar')
        self.assertFalse(report.is_valid)
        self.assertEqual(len(report.errors), 1)
        self.assertEqual(report.errors[0],
                         "Input fileGrp[@USE='foo'] not in METS!")

        # Output group that already exists
        report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG',
                                                   'OCR-D-IMG-BIN')
        self.assertFalse(report.is_valid)
        self.assertEqual(len(report.errors), 1)
        self.assertEqual(
            report.errors[0],
            "Output fileGrp[@USE='OCR-D-IMG-BIN'] already in METS!")

        # Comma-separated input groups, one of them unknown
        report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG,FOO',
                                                   'FOO')
        self.assertFalse(report.is_valid)
        self.assertEqual(len(report.errors), 1)
        self.assertEqual(report.errors[0],
                         "Input fileGrp[@USE='FOO'] not in METS!")

        # Same input, no output group at all
        report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG,FOO',
                                                   None)
        self.assertFalse(report.is_valid)
        self.assertEqual(len(report.errors), 1)
        self.assertEqual(report.errors[0],
                         "Input fileGrp[@USE='FOO'] not in METS!")

        # Neither input nor output given
        report = WorkspaceValidator.check_file_grp(workspace, None, '')
        self.assertTrue(report.is_valid)

    def test_check_file_grp_page_id_str(self):
        """``page_id`` as comma-separated string: existing output page is an error."""
        workspace = self.resolver.workspace_from_url(
            assets.url_of('SBB0000F29300010000/data/mets.xml'))
        report = WorkspaceValidator.check_file_grp(
            workspace,
            'OCR-D-IMG',
            'OCR-D-IMG-BIN',
            page_id='PHYS_0003,PHYS_0001')
        self.assertFalse(report.is_valid)
        self.assertEqual(len(report.errors), 1)
        self.assertEqual(
            report.errors[0],
            "Output fileGrp[@USE='OCR-D-IMG-BIN'] already contains output for page PHYS_0001"
        )

    def test_check_file_grp_page_id_list(self):
        """``page_id`` as list: same single error as for the string form."""
        workspace = self.resolver.workspace_from_url(
            assets.url_of('SBB0000F29300010000/data/mets.xml'))
        report = WorkspaceValidator.check_file_grp(
            workspace,
            'OCR-D-IMG',
            'OCR-D-IMG-BIN',
            page_id=['PHYS_0003', 'PHYS_0001'])
        self.assertFalse(report.is_valid)
        self.assertEqual(len(report.errors), 1)

    def test_check_file_grp_page_id_valid(self):
        """A ``page_id`` without output in the target group validates cleanly."""
        workspace = self.resolver.workspace_from_url(
            assets.url_of('SBB0000F29300010000/data/mets.xml'))
        report = WorkspaceValidator.check_file_grp(workspace,
                                                   'OCR-D-IMG',
                                                   'OCR-D-IMG-BIN',
                                                   page_id='PHYS_0004')
        self.assertTrue(report.is_valid)

    def test_simple(self):
        """The one-file sample METS validates with ``download=True``."""
        report = WorkspaceValidator.validate(
            self.resolver,
            assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'),
            download=True)
        self.assertTrue(report.is_valid)

    def test_validate_twice(self):
        """Running ``_validate`` twice on one validator still yields a valid report."""
        validator = WorkspaceValidator(
            self.resolver,
            assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'),
            download=True)
        report = validator._validate()  # pylint: disable=protected-access
        report = validator._validate()  # pylint: disable=protected-access
        # Show the collected errors only when the assertion fails.
        self.assertTrue(report.is_valid, str(report.errors))

    def test_validate_empty(self):
        """An empty workspace has three errors; setting the identifier removes one."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            report = WorkspaceValidator.validate(self.resolver,
                                                 join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors),
                             3)  # no-files, missing id, missing fileGrp
            self.assertIn('no unique identifier', report.errors[0])
            self.assertIn('No files', report.errors[1])
            workspace.mets.unique_identifier = 'foobar'
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver,
                                                 join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 2)

    def test_validate_file_groups_non_ocrd(self):
        """A fileGrp not starting with 'OCR-D-' produces a notice."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file_group('FOO')
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver,
                                                 join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 1)
            self.assertIn('No files', report.errors[0])
            self.assertEqual(len(report.notices), 1)
            self.assertIn("USE does not begin with 'OCR-D-'",
                          report.notices[0])

    def test_validate_file_groups_unspecified(self):
        """An unknown USE category in the fileGrp name produces a notice."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file_group('OCR-D-INVALID-FILEGRP')
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver,
                                                 join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 1)
            self.assertEqual(len(report.notices), 1, str(report.notices))
            self.assertEqual(
                report.notices[0],
                "Unspecified USE category 'INVALID' in fileGrp 'OCR-D-INVALID-FILEGRP'"
            )
            self.assertIn('No files', report.errors[0])

    def test_validate_file_groups_bad_name(self):
        """An invalid USE name in the fileGrp name produces a notice."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file_group('OCR-D-GT-X')
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver,
                                                 join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 1)
            self.assertEqual(len(report.notices), 1)
            self.assertIn("Invalid USE name 'X' in fileGrp", report.notices[0])
            self.assertIn('No files', report.errors[0])

    def test_validate_files_nopageid(self):
        """A file added without a pageId is reported as an error."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file('OCR-D-GT-PAGE',
                                    ID='file1',
                                    mimetype='image/png',
                                    url='http://foo')
            workspace.save_mets()
            report = WorkspaceValidator.validate(
                self.resolver,
                join(tempdir, 'mets.xml'),
                skip=['pixel_density', 'imagefilename'])
            self.assertEqual(len(report.errors), 1)
            self.assertIn("does not manifest any physical page.",
                          report.errors[0])

    def test_validate_weird_urls(self):
        """Files with a 'file:/' URL, an odd scheme and a GROUPID yield two errors."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file('OCR-D-GT-PAGE',
                                    ID='file1',
                                    mimetype='image/png',
                                    pageId='page1',
                                    url='file:/java-file-url')
            f = workspace.mets.add_file('OCR-D-GT-PAGE',
                                        ID='file2',
                                        mimetype='image/png',
                                        pageId='page2',
                                        url='nothttp://unusual.scheme')
            f._el.set('GROUPID', 'donotuse')  # pylint: disable=protected-access
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver,
                                                 join(tempdir, 'mets.xml'),
                                                 skip=['pixel_density'])
            # unittest assertions instead of bare ``assert``: they survive
            # ``python -O`` and match the rest of this class.
            self.assertFalse(report.is_valid)
            self.assertEqual(len(report.errors), 2)
            self.assertIn("invalid (Java-specific) file URL", report.errors[0])

    def test_validate_pixel_no_download(self):
        """With ``download=False`` the sample image yields an entirely empty report."""
        imgpath = assets.path_to(
            'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-BIN/BIN_0020.png')
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file('OCR-D-GT-BIN',
                                    ID='file1',
                                    mimetype='image/png',
                                    pageId='page1',
                                    url=imgpath)
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver,
                                                 join(tempdir, 'mets.xml'),
                                                 skip=[],
                                                 download=False)
            self.assertEqual(len(report.errors), 0)
            self.assertEqual(len(report.warnings), 0)
            self.assertEqual(len(report.notices), 0)

    def test_validate_pixel_density_too_low(self):
        """With ``download=True`` the sample image yields x/y resolution notices."""
        imgpath = assets.path_to(
            'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-BIN/BIN_0017.png')
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file('OCR-D-GT-BIN',
                                    ID='file1',
                                    mimetype='image/png',
                                    pageId='page1',
                                    url=imgpath)
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver,
                                                 join(tempdir, 'mets.xml'),
                                                 skip=[],
                                                 download=True)
            self.assertEqual(len(report.notices), 2)
            self.assertIn("xResolution", report.notices[0])
            self.assertIn("yResolution", report.notices[1])
            self.assertEqual(len(report.warnings), 0)
            self.assertEqual(len(report.errors), 0)

    def test_bad_workspace(self):
        """A METS location that cannot be loaded is reported, not raised."""
        report = WorkspaceValidator.validate(self.resolver, 'non existe')
        self.assertFalse(report.is_valid)
        self.assertIn('Failed to instantiate workspace:', report.errors[0])

    def test_skip_page(self):
        """With these checks skipped the kant sample workspace is valid."""
        report = WorkspaceValidator.validate(
            self.resolver,
            None,
            src_dir=assets.path_to('kant_aufklaerung_1784/data'),
            download=True,
            skip=[
                'page',
                'mets_unique_identifier',
                'mets_file_group_names',
                'mets_files',
                'pixel_density',
                'imagefilename',
            ])
        # Show the collected errors only when the assertion fails.
        self.assertTrue(report.is_valid, str(report.errors))

    def test_dimensions(self):
        """A wrong @imageHeight is an error unless the 'dimension' check is skipped."""
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'foo')
            copytree(assets.path_to('kant_aufklaerung_1784/data'), wsdir)
            with pushd_popd(wsdir):
                # Falsify the image height recorded in the PAGE file.
                os.system(
                    """sed -i 's,imageHeight="2083",imageHeight="1234",' OCR-D-GT-PAGE/PAGE_0017_PAGE.xml"""
                )
                report = WorkspaceValidator.validate(
                    self.resolver,
                    join(wsdir, 'mets.xml'),
                    src_dir=wsdir,
                    skip=[
                        'page', 'mets_unique_identifier',
                        'mets_file_group_names', 'mets_files', 'pixel_density',
                        'imagefilename', 'page_xsd', 'mets_xsd'
                    ],
                    download=True)
                self.assertIn(
                    "PAGE 'PAGE_0017_PAGE': @imageHeight != image's actual height (1234 != 2083)",
                    report.errors)
                self.assertEqual(len(report.errors), 1)
                self.assertFalse(report.is_valid)
                # Same workspace, but with the 'dimension' check skipped too.
                report2 = WorkspaceValidator.validate(
                    self.resolver,
                    join(wsdir, 'mets.xml'),
                    src_dir=wsdir,
                    skip=[
                        'page', 'mets_unique_identifier',
                        'mets_file_group_names', 'mets_files', 'pixel_density',
                        'imagefilename', 'dimension', 'page_xsd', 'mets_xsd'
                    ],
                    download=False)
            self.assertTrue(report2.is_valid)

    def test_src_dir(self):
        """Validating via ``src_dir`` yields 42 ``ConsistencyError`` entries."""
        report = WorkspaceValidator.validate(
            self.resolver,
            None,
            src_dir=assets.path_to('kant_aufklaerung_1784/data'),
            skip=['imagefilename'],
            download=True,
        )
        self.assertEqual(
            len([e for e in report.errors if isinstance(e, ConsistencyError)]),
            42, '42 textequiv consistency errors')

    def test_imagefilename(self):
        """With the other checks skipped the kant sample reports no errors."""
        report = WorkspaceValidator.validate(
            self.resolver,
            None,
            src_dir=assets.path_to('kant_aufklaerung_1784/data'),
            skip=[
                'page', 'mets_unique_identifier', 'mets_file_group_names',
                'mets_files', 'pixel_density', 'page_xsd', 'mets_xsd'
            ],
            download=False,
        )
        self.assertEqual(len(report.errors), 0)

    def test_pcgtsid(self):
        """A @pcGtsId that differs from the mets:file ID is reported as a warning."""
        with copy_of_directory(
                assets.path_to('kant_aufklaerung_1784/data')) as wsdir:
            with pushd_popd(wsdir):
                # overwrite the @pcGtsId attribute with "foo" so that it no
                # longer matches the mets:file/@ID
                os.system(
                    """sed -i 's,pcGtsId.*,pcGtsId="foo">,' OCR-D-GT-PAGE/PAGE_0017_PAGE.xml"""
                )
                report = WorkspaceValidator.validate(self.resolver,
                                                     join(wsdir, 'mets.xml'))
                self.assertIn(
                    'pc:PcGts/@pcGtsId differs from mets:file/@ID: "foo" !== "PAGE_0017_PAGE"',
                    report.warnings)
Exemplo n.º 8
0
class TestWorkspace(TestCase):
    def setUp(self):
        """Give every test its own ``Resolver`` instance."""
        self.resolver = Resolver()

    def test_workspace_add_file(self):
        """Add a file with text content and check its METS entry and on-disk copy."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            target = join(tempdir, 'ID1.tif')
            file_kwargs = dict(
                ID='ID1',
                mimetype='image/tiff',
                content='CONTENT',
                pageId=None,
                local_filename=target,
            )
            workspace.add_file('GRP', **file_kwargs)

            added = workspace.mets.find_all_files()[0]
            self.assertEqual(added.ID, 'ID1')
            self.assertEqual(added.mimetype, 'image/tiff')
            # Both url and local_filename are expected to equal the given path.
            self.assertEqual(added.url, target)
            self.assertEqual(added.local_filename, target)
            self.assertTrue(exists(target))

    def test_workspace_add_file_basename_no_content(self):
        """A file added without content, url or local_filename has ``url`` None."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.add_file(
                'GRP',
                ID='ID1',
                mimetype='image/tiff',
                pageId=None,
            )
            first_file = next(workspace.mets.find_files())
            self.assertEqual(first_file.url, None)

    def test_workspace_add_file_binary_content(self):
        """Bytes content is written to ``local_filename``, here inside a sub-directory."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            target = join(tempdir, 'subdir', 'ID1.tif')
            file_kwargs = dict(
                ID='ID1',
                content=b'CONTENT',
                local_filename=target,
                url='http://foo/bar',
                pageId=None,
            )
            workspace.add_file('GRP', **file_kwargs)
            self.assertTrue(exists(target))

    def test_workspacec_add_file_content_wo_local_filename(self):
        """Passing ``content`` without ``local_filename`` must raise."""
        expected_message = "'content' was set but no 'local_filename'"
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            with self.assertRaisesRegex(Exception, expected_message):
                workspace.add_file(
                    'GRP', ID='ID1', content=b'CONTENT', pageId='foo1234')

    def test_workspacec_add_file_content_wo_pageid(self):
        """Omitting the ``pageId`` keyword entirely must raise ``ValueError``."""
        expected_message = "workspace.add_file must be passed a 'pageId' kwarg, even if it is None."
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            with self.assertRaisesRegex(ValueError, expected_message):
                workspace.add_file(
                    'GRP', ID='ID1', content=b'CONTENT', local_filename='foo')

    def test_workspace_str(self):
        """``str()`` of an empty, saved and reloaded workspace has the expected form."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.save_mets()
            workspace.reload_mets()
            expected = (
                'Workspace[directory=%s, baseurl=None, file_groups=[], files=[]]'
                % tempdir)
            self.assertEqual(str(workspace), expected)

    def test_workspace_backup(self):
        """Saving with ``automatic_backup`` enabled leaves ``str()`` unchanged."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.automatic_backup = True
            workspace.save_mets()
            workspace.reload_mets()
            expected = (
                'Workspace[directory=%s, baseurl=None, file_groups=[], files=[]]'
                % tempdir)
            self.assertEqual(str(workspace), expected)

    def test_download_url0(self):
        """Downloading this test file by absolute path returns ``TEMP/<basename>``."""
        this_file = abspath(__file__)
        with TemporaryDirectory() as directory:
            workspace = self.resolver.workspace_from_nothing(directory)
            downloaded = workspace.download_url(this_file)
            expected = join('TEMP', basename(__file__))
            self.assertEqual(downloaded, expected)

    def test_download_url_without_baseurl(self):
        """Without ``src_baseurl`` the sample file URL cannot be downloaded."""
        with TemporaryDirectory() as tempdir:
            mets_copy = join(tempdir, 'mets.xml')
            copyfile(SRC_METS, mets_copy)
            workspace = self.resolver.workspace_from_url(mets_copy)
            expected_message = "Already tried prepending baseurl '%s'" % tempdir
            with self.assertRaisesRegex(Exception, expected_message):
                workspace.download_url(SAMPLE_FILE_URL)

    def test_download_url_with_baseurl(self):
        """With ``src_baseurl`` set the sample file lands in ``TEMP/`` of the workspace."""
        with TemporaryDirectory() as tempdir:
            mets_copy = join(tempdir, 'mets.xml')
            copyfile(SRC_METS, mets_copy)
            workspace = self.resolver.workspace_from_url(
                mets_copy, src_baseurl=dirname(SRC_METS))
            downloaded = Path(workspace.download_url(SAMPLE_FILE_URL))
            expected = Path('TEMP', '%s.tif' % SAMPLE_FILE_ID)
            self.assertEqual(downloaded, expected)
            # The returned path is relative to the workspace directory.
            self.assertTrue(Path(workspace.directory, downloaded).exists())

    def test_from_url_dst_dir_download(self):
        """
        https://github.com/OCR-D/core/issues/319

        Resolve a METS given as a relative path into a not-yet-existing
        ``dst_dir`` with ``download=True``.
        """
        absolute_mets = Path(
            assets.path_to('kant_aufklaerung_1784/data/mets.xml'))
        with TemporaryDirectory() as tempdir:
            ws_dir = join(tempdir, 'non-existing-for-good-measure')
            # Create a relative path to trigger #319
            relative_mets = str(absolute_mets.relative_to(Path.cwd()))
            self.resolver.workspace_from_url(
                relative_mets, dst_dir=ws_dir, download=True)
            # sanity check, mets.xml must exist
            mets_exists = Path(ws_dir, 'mets.xml').exists()
            self.assertTrue(mets_exists)
            page_exists = Path(
                ws_dir, 'OCR-D-GT-PAGE/PAGE_0017_PAGE.xml').exists()
            self.assertTrue(page_exists)

    def test_superfluous_copies_in_ws_dir(self):
        """
        https://github.com/OCR-D/core/issues/227

        Downloading the files of the one-file METS must leave exactly two
        files below the workspace directory: mets.xml and the image.
        """
        def find_recursive(root):
            # Names of all files below root, directories not included.
            return [name for _, _, names in walk(root) for name in names]

        src_mets = assets.path_to('SBB0000F29300010000/data/mets_one_file.xml')
        with TemporaryDirectory() as wsdir:
            with open(src_mets, 'r') as f_in, \
                    open(join(wsdir, 'mets.xml'), 'w') as f_out:
                f_out.write(f_in.read())
            self.assertEqual(len(find_recursive(wsdir)), 1)

            workspace = Workspace(self.resolver, wsdir)
            for mets_file in workspace.mets.find_all_files():
                workspace.download_file(mets_file)

            self.assertEqual(len(find_recursive(wsdir)), 2)
            image_path = join(wsdir, 'OCR-D-IMG/FILE_0005_IMAGE.tif')
            self.assertTrue(exists(image_path))

    def test_remove_file_force(self):
        """Removing an unknown file ID fails unless ``force`` or overwrite mode is on."""
        missing_id = 'non-existing-id'
        sample_dir = assets.path_to('SBB0000F29300010000/data')
        with copy_of_directory(sample_dir) as tempdir:
            ws = Workspace(self.resolver, directory=tempdir)
            # should fail
            with self.assertRaisesRegex(FileNotFoundError, "not found"):
                ws.remove_file(missing_id)
            # should succeed
            ws.remove_file(missing_id, force=True)
            # should also succeed
            ws.overwrite_mode = True
            ws.remove_file(missing_id, force=False)

    def test_remove_file_remote(self):
        """Removing a file that only has a remote URL needs ``force`` or overwrite mode."""
        file_id = 'page1_img'
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.add_file(
                'IMG',
                ID=file_id,
                mimetype='image/tiff',
                url='http://remote',
                pageId=None,
            )
            # should fail
            with self.assertRaisesRegex(Exception, "not locally available"):
                workspace.remove_file(file_id)
            # should succeed
            workspace.remove_file(file_id, force=True)
            # should also succeed
            workspace.overwrite_mode = True
            workspace.remove_file(file_id, force=False)

    def test_remove_file_group_force(self):
        """Removing an unknown fileGrp fails unless ``force`` or overwrite mode is on."""
        missing_group = 'I DO NOT EXIST'
        sample_dir = assets.path_to('SBB0000F29300010000/data')
        with copy_of_directory(sample_dir) as tempdir:
            ws = Workspace(self.resolver, directory=tempdir)
            # should fail
            with self.assertRaisesRegex(Exception, "No such fileGrp"):
                ws.remove_file_group(missing_group)
            # should succeed
            ws.remove_file_group(missing_group, force=True)
            # should also succeed
            ws.overwrite_mode = True
            ws.remove_file_group(missing_group, force=False)

    def test_remove_file_group_rmdir(self):
        """Recursively removing a fileGrp also removes its directory."""
        sample_dir = assets.path_to('SBB0000F29300010000/data')
        with copy_of_directory(sample_dir) as tempdir:
            ws = Workspace(self.resolver, directory=tempdir)
            group_dir = join(tempdir, 'OCR-D-IMG')
            self.assertTrue(exists(group_dir))
            ws.remove_file_group('OCR-D-IMG', recursive=True)
            self.assertFalse(exists(group_dir))

    def test_remove_file_page_recursive(self):
        """``page_recursive`` removal shrinks the METS from 119 to 83 files."""
        sample_dir = assets.path_to('kant_aufklaerung_1784-complex/data')
        with copy_of_directory(sample_dir) as tempdir, pushd_popd(tempdir):
            workspace = Workspace(self.resolver, directory=tempdir)
            self.assertEqual(len(workspace.mets.find_all_files()), 119)
            removal_options = dict(
                page_recursive=True,
                page_same_group=False,
                keep_file=True,
            )
            workspace.remove_file(
                'OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001',
                **removal_options)
            self.assertEqual(len(workspace.mets.find_all_files()), 83)
            # A second recursive removal only has to complete without raising.
            workspace.remove_file('PAGE_0017_ALTO', page_recursive=True)

    def test_remove_file_page_recursive_keep_file(self):
        """Recursive removal across groups deletes two files from disk.

        NOTE(review): despite the test name, ``keep_file`` is not passed
        here; ``force=True`` is used instead - confirm the intent.
        """
        sample_dir = assets.path_to('kant_aufklaerung_1784-complex/data')
        with copy_of_directory(sample_dir) as tempdir, pushd_popd(tempdir):
            workspace = Workspace(self.resolver, directory=tempdir)
            files_before = count_files()
            workspace.remove_file(
                'OCR-D-IMG-BINPAGE-sauvola_0001',
                page_recursive=True,
                page_same_group=False,
                force=True)
            files_after = count_files()
            self.assertEqual(files_after, files_before - 2, '2 files deleted')

    def test_remove_file_page_recursive_same_group(self):
        """Check that a page-recursive removal limited to the same group lowers ``count_files()`` by one."""
        with copy_of_directory(
                assets.path_to(
                    'kant_aufklaerung_1784-complex/data')) as tempdir:
            with pushd_popd(tempdir):
                ws = Workspace(self.resolver, directory=tempdir)
                before = count_files()
                ws.remove_file('OCR-D-IMG-BINPAGE-sauvola_0001',
                               page_recursive=True,
                               page_same_group=True,
                               force=False)
                after = count_files()
                # The failure message must agree with the asserted difference
                # (it used to claim two deletions while checking for one).
                self.assertEqual(after, before - 1, '1 file deleted')

    def test_download_to_directory_from_workspace_download_file(self):
        """
        Check that ``download_file`` leaves the URL of already-local files unchanged.

        https://github.com/OCR-D/core/issues/342
        """
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)

            img_file = ws1.add_file(
                'IMG',
                ID='page1_img',
                mimetype='image/tiff',
                local_filename='test.tif',
                content='',
                pageId=None,
            )
            gt_file = ws1.add_file(
                'GT',
                ID='page1_gt',
                mimetype='text/xml',
                local_filename='test.xml',
                content='',
                pageId=None,
            )
            expectations = ((img_file, 'test.tif'), (gt_file, 'test.xml'))

            for ocrd_file, expected_url in expectations:
                self.assertEqual(ocrd_file.url, expected_url)

            # these should be no-ops
            for ocrd_file, _ in expectations:
                ws1.download_file(ocrd_file)

            for ocrd_file, expected_url in expectations:
                self.assertEqual(ocrd_file.url, expected_url)

    def test_save_image_file(self):
        """Save an image with an unknown and a known mimetype, then overwrite it twice."""
        image = Image.new('RGB', (1000, 1000))
        jpeg_args = (image, 'page1_img', 'IMG', 'page1', 'image/jpeg')
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            # An unknown mimetype is expected to raise KeyError.
            with self.assertRaisesRegex(KeyError, ''):
                ws.save_image_file(image, 'page1_img', 'IMG', 'page1',
                                   'ceci/nest/pas/une/mimetype')
            ws.save_image_file(*jpeg_args)
            saved_path = join(tempdir, 'IMG', 'page1_img.jpg')
            self.assertTrue(exists(saved_path))
            # should succeed
            ws.save_image_file(*jpeg_args, force=True)
            # should also succeed
            ws.overwrite_mode = True
            ws.save_image_file(*jpeg_args)

    def test_resolve_image_exif(self):
        """Check compression and width reported by ``resolve_image_exif`` for a sample image."""
        data_dir = assets.path_to('kant_aufklaerung_1784/data/')
        with pushd_popd(data_dir):
            workspace = self.resolver.workspace_from_url('mets.xml')
            image_exif = workspace.resolve_image_exif('OCR-D-IMG/INPUT_0017.tif')
            self.assertEqual(image_exif.compression, 'jpeg')
            self.assertEqual(image_exif.width, 1457)

    def test_resolve_image_as_pil(self):
        """Check the width of the image from ``resolve_image_as_pil``, with and without ``coords``."""
        image_path = 'OCR-D-IMG/INPUT_0017.tif'
        data_dir = assets.path_to('kant_aufklaerung_1784/data/')
        with pushd_popd(data_dir):
            workspace = self.resolver.workspace_from_url('mets.xml')
            full_image = workspace.resolve_image_as_pil(image_path)
            self.assertEqual(full_image.width, 1457)
            cropped_image = workspace.resolve_image_as_pil(
                image_path, coords=([100, 100], [50, 50]))
            self.assertEqual(cropped_image.width, 50)

    def test_image_from_page_basic(self):
        """Check the ``features`` reported by ``image_from_page`` with and without feature constraints."""
        with pushd_popd(assets.path_to('gutachten/data')):
            ws = self.resolver.workspace_from_url('mets.xml')
            # Decode explicitly as UTF-8 so the round trip through
            # ``.encode('utf8')`` does not depend on the locale encoding.
            with open('TEMP1/PAGE_TEMP1.xml', 'r', encoding='utf8') as f:
                pcgts = parseString(f.read().encode('utf8'), silence=True)
            img, info, exif = ws.image_from_page(pcgts.get_Page(),
                                                 page_id='PHYS_0017',
                                                 feature_selector='clipped',
                                                 feature_filter='cropped')
            # ``assertEquals`` is a deprecated alias removed in Python 3.12.
            self.assertEqual(info['features'], 'binarized,clipped')
            img, info, exif = ws.image_from_page(pcgts.get_Page(),
                                                 page_id='PHYS_0017')
            self.assertEqual(info['features'], 'binarized,clipped')

    def test_downsample_16bit_image(self):
        """Check that ``_resolve_image_as_pil`` returns mode ``L`` for an ``I;16`` TIFF."""
        with pushd_popd(tempdir=True) as tempdir:
            # Unpack the gzipped fixture into the temporary working directory.
            with gzip_open(
                    join(dirname(__file__),
                         'data/OCR-D-IMG_APBB_Mitteilungen_62.0002.tif.gz'),
                    'rb') as gzip_in:
                with open('16bit.tif', 'wb') as tif_out:
                    tif_out.write(gzip_in.read())
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('IMG',
                        ID='foo',
                        url='16bit.tif',
                        mimetype='image/tiff',
                        pageId=None)
            # Close the handle deterministically; the file sits in a
            # temporary directory that is removed when the block exits.
            with Image.open('16bit.tif') as pil_before:
                assert pil_before.mode == 'I;16'
            pil_after = ws._resolve_image_as_pil('16bit.tif')
            assert pil_after.mode == 'L'

    def test_mets_permissions(self):
        """Check the mode of ``mets.xml`` after the first save and after a chmod plus re-save."""
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(tempdir)
            ws.save_mets()
            mets_path = join(ws.directory, 'mets.xml')
            # umask() returns the previous mask, so set a dummy value and
            # restore it immediately to read the current one.
            current_mask = umask(0)
            umask(current_mask)
            expected_mode = 0o100664 & ~current_mask
            assert stat(mets_path).st_mode == expected_mode
            chmod(mets_path, 0o777)
            ws.save_mets()
            mode_string = filemode(stat(mets_path).st_mode)
            assert mode_string == '-rwxrwxrwx'
Exemplo n.º 9
0
class TestProcessor(TestCase):
    """Tests for ``Processor`` subclasses, ``run_processor`` and ``run_cli``."""

    def setUp(self):
        """Reset logging and open the SBB sample workspace shared by the tests."""
        disableLogging()
        initLogging()
        self.resolver = Resolver()
        mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
        self.workspace = self.resolver.workspace_from_url(mets_url)

    def test_incomplete_processor(self):
        """``IncompleteProcessor.process()`` must raise 'Must be implemented'."""
        incomplete = IncompleteProcessor(None)
        with self.assertRaisesRegex(Exception, 'Must be implemented'):
            incomplete.process()

    def test_no_resolver(self):
        """``run_processor`` without a resolver must raise."""
        expected_message = 'pass a resolver to create a workspace'
        with self.assertRaisesRegex(Exception, expected_message):
            run_processor(DummyProcessor)

    def test_no_mets_url(self):
        """``run_processor`` with a resolver but no METS URL must raise."""
        expected_message = 'pass mets_url to create a workspace'
        with self.assertRaisesRegex(Exception, expected_message):
            run_processor(DummyProcessor, resolver=self.resolver)

    def test_no_input_file_grp(self):
        """Reading ``input_files`` without an input fileGrp must raise."""
        mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
        processor = run_processor(DummyProcessor,
                                  resolver=self.resolver,
                                  mets_url=mets_url)
        expected_message = 'Processor is missing input fileGrp'
        with self.assertRaisesRegex(Exception, expected_message):
            _ = processor.input_files

    def test_with_mets_url_input_files(self):
        """The ``OCR-D-SEG-PAGE`` group of the sample yields two PAGE files."""
        mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
        processor = run_processor(DummyProcessor,
                                  input_file_grp='OCR-D-SEG-PAGE',
                                  resolver=self.resolver,
                                  mets_url=mets_url)
        self.assertEqual(len(processor.input_files), 2)
        only_page_files = all(
            f.mimetype == MIMETYPE_PAGE for f in processor.input_files)
        self.assertTrue(only_page_files)

    def test_parameter(self):
        """Run the dummy processor with parameters loaded from a JSON file."""
        with TemporaryDirectory() as tempdir:
            param_path = join(tempdir, 'params.json')
            with open(param_path, 'w') as json_out:
                json_out.write('{"baz": "quux"}')
            with open(param_path, 'r') as json_in:
                parameter = json.load(json_in)
                mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
                processor = run_processor(DummyProcessor,
                                          parameter=parameter,
                                          input_file_grp="OCR-D-IMG",
                                          resolver=self.resolver,
                                          mets_url=mets_url)
            self.assertEqual(len(processor.input_files), 3)

    def test_verify(self):
        """``verify()`` of the dummy processor returns ``True``."""
        dummy = DummyProcessor(self.workspace)
        self.assertEqual(dummy.verify(), True)

    def test_json(self):
        """Constructing the dummy processor with ``dump_json=True`` must not raise."""
        DummyProcessor(self.workspace, dump_json=True)

    def test_params_missing_required(self):
        """Omitting a required parameter must raise on construction."""
        expected_message = 'is a required property'
        with self.assertRaisesRegex(Exception, expected_message):
            DummyProcessorWithRequiredParameters(workspace=self.workspace)

    def test_params(self):
        """A plain ``Processor`` has an empty parameter dict."""
        plain = Processor(workspace=self.workspace)
        self.assertEqual(plain.parameter, {})

    def test_run_agent(self):
        """``run_processor`` adds exactly one agent to the METS."""
        agents_before = len(self.workspace.mets.agents)
        run_processor(DummyProcessor,
                      ocrd_tool=DUMMY_TOOL,
                      workspace=self.workspace)
        agents_after = len(self.workspace.mets.agents)
        self.assertEqual(agents_after, agents_before + 1, 'one more agent')

    def test_run_cli(self):
        """Call ``run_cli`` once with all options and once with the minimum."""
        with TemporaryDirectory() as tempdir:
            run_processor(DummyProcessor,
                          ocrd_tool=DUMMY_TOOL,
                          workspace=self.workspace)
            all_options = dict(
                mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
                resolver=Resolver(),
                workspace=None,
                page_id='page1',
                log_level='DEBUG',
                input_file_grp='INPUT',
                output_file_grp='OUTPUT',
                parameter='/path/to/param.json',
                working_dir=tempdir,
            )
            run_cli('echo', **all_options)
            minimal_options = dict(
                mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
                resolver=Resolver(),
            )
            run_cli('echo', **minimal_options)

    def test_zip_input_files(self):
        """Zip two file groups page-wise, unfiltered and with mimetype filters."""
        class ZipTestProcessor(Processor):
            pass

        # (fileGrp, mimetype, ID, pageId) of the files to register
        fixture_files = (
            ('GRP1', MIMETYPE_PAGE, 'foobar1', 'phys_0001'),
            ('GRP2', 'application/alto+xml', 'foobar2', 'phys_0001'),
            ('GRP1', MIMETYPE_PAGE, 'foobar3', 'phys_0002'),
            ('GRP2', MIMETYPE_PAGE, 'foobar4', 'phys_0002'),
        )
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            for file_grp, mimetype, file_id, page in fixture_files:
                ws.add_file(file_grp,
                            mimetype=mimetype,
                            ID=file_id,
                            pageId=page)
            for page_id in (None, 'phys_0001,phys_0002'):
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws,
                                            input_file_grp='GRP1,GRP2',
                                            page_id=page_id)
                    id_pairs = [(first.ID, second.ID)
                                for first, second in proc.zip_input_files()]
                    assert ('foobar1', 'foobar2') in id_pairs
                    assert ('foobar3', 'foobar4') in id_pairs
                    page_only = proc.zip_input_files(mimetype=MIMETYPE_PAGE)
                    id_pairs = [(first.ID, second)
                                for first, second in page_only]
                    assert ('foobar1', None) in id_pairs
                    page_or_alto = proc.zip_input_files(
                        mimetype=r'//application/(vnd.prima.page|alto)\+xml')
                    id_pairs = [(first.ID, second.ID)
                                for first, second in page_or_alto]
                    assert ('foobar1', 'foobar2') in id_pairs
                    assert ('foobar3', 'foobar4') in id_pairs

    def test_zip_input_files_multi_mixed(self):
        """Zip groups holding several files per page and check each ``on_error`` mode."""
        class ZipTestProcessor(Processor):
            pass

        # (fileGrp, mimetype, ID, pageId) of the files to register up front
        fixture_files = (
            ('GRP1', MIMETYPE_PAGE, 'foobar1', 'phys_0001'),
            ('GRP1', 'image/png', 'foobar1img1', 'phys_0001'),
            ('GRP1', 'image/png', 'foobar1img2', 'phys_0001'),
            ('GRP2', MIMETYPE_PAGE, 'foobar2', 'phys_0001'),
            ('GRP1', MIMETYPE_PAGE, 'foobar3', 'phys_0002'),
            ('GRP2', 'image/tiff', 'foobar4', 'phys_0002'),
        )
        page_id_variants = (None, 'phys_0001,phys_0002')
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            for file_grp, mimetype, file_id, page in fixture_files:
                ws.add_file(file_grp,
                            mimetype=mimetype,
                            ID=file_id,
                            pageId=page)
            for page_id in page_id_variants:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws,
                                            input_file_grp='GRP1,GRP2',
                                            page_id=page_id)
                    print("unfiltered")
                    id_pairs = [(first.ID, second.ID)
                                for first, second in proc.zip_input_files()]
                    assert ('foobar1', 'foobar2') in id_pairs
                    assert ('foobar3', 'foobar4') in id_pairs
                    print("PAGE-filtered")
                    page_only = proc.zip_input_files(mimetype=MIMETYPE_PAGE)
                    id_pairs = [(first.ID, second)
                                for first, second in page_only]
                    assert ('foobar3', None) in id_pairs
            # A second image for phys_0002 in GRP2 gives multiple matches there.
            ws.add_file('GRP2',
                        mimetype='image/tiff',
                        ID='foobar4dup',
                        pageId='phys_0002')
            for page_id in page_id_variants:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws,
                                            input_file_grp='GRP1,GRP2',
                                            page_id=page_id)
                    take_first = proc.zip_input_files(on_error='first')
                    id_pairs = [(first.ID, second.ID)
                                for first, second in take_first]
                    assert ('foobar1', 'foobar2') in id_pairs
                    assert ('foobar3', 'foobar4') in id_pairs
                    skipping = proc.zip_input_files(on_error='skip')
                    id_pairs = [(first.ID, second)
                                for first, second in skipping]
                    assert ('foobar3', None) in id_pairs
                    expected_message = (
                        "No PAGE-XML for page .* in fileGrp .* but multiple matches."
                    )
                    with self.assertRaisesRegex(Exception, expected_message):
                        proc.zip_input_files(on_error='abort')
            # A second PAGE file for phys_0001 in GRP2 must always be an error.
            ws.add_file('GRP2',
                        mimetype=MIMETYPE_PAGE,
                        ID='foobar2dup',
                        pageId='phys_0001')
            for page_id in page_id_variants:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws,
                                            input_file_grp='GRP1,GRP2',
                                            page_id=page_id)
                    with self.assertRaisesRegex(
                            Exception, "Multiple PAGE-XML matches for page"):
                        proc.zip_input_files()

    def test_zip_input_files_require_first(self):
        """With ``require_first=False`` a page missing from the first group is still zipped."""
        class ZipTestProcessor(Processor):
            pass

        self.capture_out_err()
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('GRP1',
                        mimetype=MIMETYPE_PAGE,
                        ID='foobar1',
                        pageId=None)
            ws.add_file('GRP2',
                        mimetype=MIMETYPE_PAGE,
                        ID='foobar2',
                        pageId='phys_0001')
            for page_id in (None, 'phys_0001,phys_0002'):
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws,
                                            input_file_grp='GRP1,GRP2',
                                            page_id=page_id)
                    zipped = proc.zip_input_files(require_first=False)
                    id_pairs = [(first, second.ID) for first, second in zipped]
                    assert id_pairs == [(None, 'foobar2')]
        captured = self.capture_out_err()
        assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in captured.err
Exemplo n.º 10
0
class TestWorkspace(TestCase):
    """Tests for adding files to a workspace, its string form and URL downloads."""

    def setUp(self):
        """Create a fresh resolver for every test."""
        self.resolver = Resolver()

    def test_workspace_add_file(self):
        """Add a file with text content and check its METS attributes and on-disk copy."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            fpath = join(tempdir, 'ID1.tif')
            ws1.add_file(
                'GRP',
                ID='ID1',
                mimetype='image/tiff',
                content='CONTENT',
                local_filename=fpath
            )
            f = ws1.mets.find_files()[0]
            self.assertEqual(f.ID, 'ID1')
            self.assertEqual(f.mimetype, 'image/tiff')
            self.assertEqual(f.url, fpath)
            self.assertEqual(f.local_filename, fpath)
            self.assertTrue(exists(fpath))

    def test_workspace_add_file_basename_no_content(self):
        """A file added without content or filename has no URL."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.add_file('GRP', ID='ID1', mimetype='image/tiff')
            f = ws1.mets.find_files()[0]
            self.assertEqual(f.url, None)

    def test_workspace_add_file_binary_content(self):
        """Binary content is written to ``local_filename``, here inside a subdirectory."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            fpath = join(tempdir, 'subdir', 'ID1.tif')
            ws1.add_file('GRP', ID='ID1', content=b'CONTENT', local_filename=fpath, url='http://foo/bar')
            self.assertTrue(exists(fpath))

    def test_workspacec_add_file_content_wo_local_filename(self):
        """Passing ``content`` without ``local_filename`` must raise."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            with self.assertRaisesRegex(Exception, "'content' was set but no 'local_filename'"):
                ws1.add_file('GRP', ID='ID1', content=b'CONTENT')

    def test_workspace_str(self):
        """Check ``str()`` of an empty workspace after saving and reloading the METS."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(str(ws1), 'Workspace[directory=%s, baseurl=None, file_groups=[], files=[]]' % tempdir)

    def test_workspace_backup(self):
        """Same as ``test_workspace_str`` but with ``automatic_backup`` enabled."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.automatic_backup = True
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(str(ws1), 'Workspace[directory=%s, baseurl=None, file_groups=[], files=[]]' % tempdir)

    def test_download_url0(self):
        """Downloading a local absolute path returns a path below ``TEMP``."""
        with TemporaryDirectory() as directory:
            ws1 = self.resolver.workspace_from_nothing(directory)
            fn = ws1.download_url(abspath(__file__))
            self.assertEqual(fn, join('TEMP', basename(__file__)))

    def test_download_url_without_baseurl(self):
        """Downloading a relative URL without a source base URL must raise."""
        # Local import: the file's import block is not touched by this change.
        import re
        with TemporaryDirectory() as tempdir:
            dst_mets = join(tempdir, 'mets.xml')
            copyfile(SRC_METS, dst_mets)
            ws1 = self.resolver.workspace_from_url(dst_mets)
            # The expected message is used as a regular expression, so the
            # directory name must be escaped to be matched literally.
            expected = "Already tried prepending baseurl '%s'" % re.escape(tempdir)
            with self.assertRaisesRegex(Exception, expected):
                ws1.download_url(SAMPLE_FILE_URL)

    def test_download_url_with_baseurl(self):
        """With ``src_baseurl`` set, the sample file is downloaded below ``TEMP``."""
        with TemporaryDirectory() as tempdir:
            dst_mets = join(tempdir, 'mets.xml')
            copyfile(SRC_METS, dst_mets)
            ws1 = self.resolver.workspace_from_url(dst_mets, src_baseurl=dirname(SRC_METS))
            f = Path(ws1.download_url(SAMPLE_FILE_URL))
            self.assertEqual(f, Path('TEMP', '%s.tif' % SAMPLE_FILE_ID))
            self.assertTrue(Path(ws1.directory, f).exists())

    def test_227_1(self):
        """Downloading the single file of a METS adds exactly one file to the workspace.

        NOTE(review): the name suggests a regression test for issue #227 -- confirm.
        """
        def find_recursive(root):
            """Return the names of all files below ``root``."""
            ret = []
            for _, _, f in walk(root):
                for file in f:
                    ret.append(file)
            return ret
        with TemporaryDirectory() as wsdir:
            with open(assets.path_to('SBB0000F29300010000/data/mets_one_file.xml'), 'r') as f_in:
                with open(join(wsdir, 'mets.xml'), 'w') as f_out:
                    f_out.write(f_in.read())
            # Only mets.xml exists before the download ...
            self.assertEqual(len(find_recursive(wsdir)), 1)
            ws1 = Workspace(self.resolver, wsdir)
            for file in ws1.mets.find_files():
                ws1.download_file(file)
            # ... and exactly one more file afterwards.
            self.assertEqual(len(find_recursive(wsdir)), 2)
            self.assertTrue(exists(join(wsdir, 'OCR-D-IMG/FILE_0005_IMAGE.tif')))
Exemplo n.º 11
0
class TestWorkspace(TestCase):
    """Tests for adding files to a workspace, its string form and downloads."""

    def setUp(self):
        """Create a fresh resolver for every test."""
        self.resolver = Resolver()

    def test_workspace_add_file(self):
        """Add a file with text content and check its METS attributes and on-disk copy."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            fpath = join(tempdir, 'ID1.tif')
            ws1.add_file('GRP',
                         ID='ID1',
                         mimetype='image/tiff',
                         content='CONTENT',
                         local_filename=fpath)
            f = ws1.mets.find_files()[0]
            self.assertEqual(f.ID, 'ID1')
            self.assertEqual(f.mimetype, 'image/tiff')
            self.assertEqual(f.url, fpath)
            self.assertEqual(f.local_filename, fpath)
            self.assertTrue(exists(fpath))

    def test_workspace_add_file_basename_no_content(self):
        """A file added without content or filename has no URL."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.add_file('GRP', ID='ID1', mimetype='image/tiff')
            f = ws1.mets.find_files()[0]
            self.assertEqual(f.url, None)

    def test_workspace_add_file_binary_content(self):
        """Binary content is written to ``local_filename``, here inside a subdirectory."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            fpath = join(tempdir, 'subdir', 'ID1.tif')
            ws1.add_file('GRP',
                         ID='ID1',
                         content=b'CONTENT',
                         local_filename=fpath,
                         url='http://foo/bar')
            self.assertTrue(exists(fpath))

    def test_workspacec_add_file_content_wo_local_filename(self):
        """Passing ``content`` without ``local_filename`` must raise."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            with self.assertRaisesRegex(
                    Exception, "'content' was set but no 'local_filename'"):
                ws1.add_file('GRP', ID='ID1', content=b'CONTENT')

    def test_workspace_str(self):
        """Check ``str()`` of an empty workspace after saving and reloading the METS."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(
                str(ws1),
                'Workspace[directory=%s, baseurl=None, file_groups=[], files=[]]'
                % tempdir)

    def test_workspace_backup(self):
        """Same as ``test_workspace_str`` but with ``automatic_backup`` enabled."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.automatic_backup = True
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(
                str(ws1),
                'Workspace[directory=%s, baseurl=None, file_groups=[], files=[]]'
                % tempdir)

    def test_download_url0(self):
        """Downloading a local absolute path returns a path below ``TEMP``."""
        with TemporaryDirectory() as directory:
            ws1 = self.resolver.workspace_from_nothing(directory)
            fn = ws1.download_url(abspath(__file__))
            self.assertEqual(fn, join('TEMP', basename(__file__)))

    def test_download_url_without_baseurl(self):
        """Downloading a relative URL without a source base URL must raise."""
        # Local import: the file's import block is not touched by this change.
        import re
        with TemporaryDirectory() as tempdir:
            dst_mets = join(tempdir, 'mets.xml')
            copyfile(SRC_METS, dst_mets)
            ws1 = self.resolver.workspace_from_url(dst_mets)
            # The expected message is used as a regular expression, so the
            # directory name must be escaped to be matched literally.
            expected = ("Already tried prepending baseurl '%s'"
                        % re.escape(tempdir))
            with self.assertRaisesRegex(Exception, expected):
                ws1.download_url(SAMPLE_FILE_URL)

    def test_download_url_with_baseurl(self):
        """With ``src_baseurl`` set, the sample file is downloaded below ``TEMP``."""
        with TemporaryDirectory() as tempdir:
            dst_mets = join(tempdir, 'mets.xml')
            copyfile(SRC_METS, dst_mets)
            ws1 = self.resolver.workspace_from_url(
                dst_mets, src_baseurl=dirname(SRC_METS))
            f = Path(ws1.download_url(SAMPLE_FILE_URL))
            self.assertEqual(f, Path('TEMP', '%s.tif' % SAMPLE_FILE_ID))
            self.assertTrue(Path(ws1.directory, f).exists())

    def test_from_url_dst_dir_download(self):
        """
        Clone a workspace from a relative METS path into a new directory.

        https://github.com/OCR-D/core/issues/319
        """
        with TemporaryDirectory() as tempdir:
            ws_dir = join(tempdir, 'non-existing-for-good-measure')
            # Create a relative path to trigger #319
            src_path = str(
                Path(assets.path_to(
                    'kant_aufklaerung_1784/data/mets.xml')).relative_to(
                        Path.cwd()))
            self.resolver.workspace_from_url(src_path,
                                             dst_dir=ws_dir,
                                             download=True)
            self.assertTrue(
                Path(ws_dir,
                     'mets.xml').exists())  # sanity check, mets.xml must exist
            self.assertTrue(
                Path(ws_dir, 'OCR-D-GT-PAGE/PAGE_0017_PAGE.xml').exists())

    def test_superfluous_copies_in_ws_dir(self):
        """
        Downloading the single file of a METS adds exactly one file to the workspace.

        https://github.com/OCR-D/core/issues/227
        """
        def find_recursive(root):
            """Return the names of all files below ``root``."""
            ret = []
            for _, _, f in walk(root):
                for file in f:
                    ret.append(file)
            return ret

        with TemporaryDirectory() as wsdir:
            with open(
                    assets.path_to(
                        'SBB0000F29300010000/data/mets_one_file.xml'),
                    'r') as f_in:
                with open(join(wsdir, 'mets.xml'), 'w') as f_out:
                    f_out.write(f_in.read())
            # Only mets.xml exists before the download ...
            self.assertEqual(len(find_recursive(wsdir)), 1)
            ws1 = Workspace(self.resolver, wsdir)
            for file in ws1.mets.find_files():
                ws1.download_file(file)
            # ... and exactly one more file afterwards.
            self.assertEqual(len(find_recursive(wsdir)), 2)
            self.assertTrue(
                exists(join(wsdir, 'OCR-D-IMG/FILE_0005_IMAGE.tif')))

    def test_download_to_directory_from_workspace_download_file(self):
        """
        Check that ``download_file`` leaves the URL of already-local files unchanged.

        https://github.com/OCR-D/core/issues/342
        """
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)

            f1 = ws1.add_file('IMG',
                              ID='page1_img',
                              mimetype='image/tiff',
                              local_filename='test.tif',
                              content='')
            f2 = ws1.add_file('GT',
                              ID='page1_gt',
                              mimetype='text/xml',
                              local_filename='test.xml',
                              content='')

            self.assertEqual(f1.url, 'test.tif')
            self.assertEqual(f2.url, 'test.xml')

            # these should be no-ops
            ws1.download_file(f1)
            ws1.download_file(f2)

            self.assertEqual(f1.url, 'test.tif')
            self.assertEqual(f2.url, 'test.xml')
Exemplo n.º 12
0
class TestWorkspaceValidator(TestCase):
    """Tests for ``WorkspaceValidator`` against bundled assets and freshly
    created temporary workspaces."""

    def setUp(self):
        """Create a fresh ``Resolver`` for every test."""
        self.resolver = Resolver()

    def test_simple(self):
        """The one-file METS asset validates cleanly with downloading enabled."""
        report = WorkspaceValidator.validate(
            self.resolver,
            assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'),
            download=True)
        self.assertTrue(report.is_valid)

    def test_validate_twice(self):
        """Calling ``_validate`` a second time on the same validator still
        yields a valid report."""
        validator = WorkspaceValidator(
            self.resolver,
            assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'),
            download=True)
        # Only the report of the second run is inspected.
        validator._validate()  # pylint: disable=protected-access
        report = validator._validate()  # pylint: disable=protected-access
        self.assertTrue(report.is_valid)

    def test_validate_empty(self):
        """An empty workspace reports a missing identifier and missing files;
        setting the identifier leaves a single error."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            report = WorkspaceValidator.validate(self.resolver,
                                                 join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 2)
            self.assertIn('no unique identifier', report.errors[0])
            self.assertIn('No files', report.errors[1])
            workspace.mets.unique_identifier = 'foobar'
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver,
                                                 join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 1)

    def test_validate_file_groups_non_ocrd(self):
        """A file group named ``FOO`` yields a notice about the missing
        ``OCR-D-`` prefix in addition to the 'No files' error."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file_group('FOO')
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver,
                                                 join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 1)
            self.assertIn('No files', report.errors[0])
            self.assertEqual(len(report.notices), 1)
            self.assertIn("USE does not begin with 'OCR-D-'",
                          report.notices[0])

    def test_validate_file_groups_unspecified(self):
        """An unknown USE category in the file group name is reported as a
        warning, next to the 'No files' error."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file_group('OCR-D-INVALID-FILEGRP')
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver,
                                                 join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 1)
            self.assertEqual(len(report.warnings), 1)
            self.assertEqual(
                report.warnings[0],
                "Unspecified USE category 'INVALID' in fileGrp 'OCR-D-INVALID-FILEGRP'"
            )
            self.assertIn('No files', report.errors[0])

    def test_validate_file_groups_bad_name(self):
        """An invalid USE name in the file group name is reported as a
        warning, next to the 'No files' error."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file_group('OCR-D-GT-X')
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver,
                                                 join(tempdir, 'mets.xml'))
            self.assertEqual(len(report.errors), 1)
            self.assertEqual(len(report.warnings), 1)
            self.assertIn("Invalid USE name 'X' in fileGrp",
                          report.warnings[0])
            self.assertIn('No files', report.errors[0])

    def test_validate_files_nopageid(self):
        """A file added without a ``pageId`` produces the 'does not manifest
        any physical page.' error."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file('OCR-D-GT-PAGE',
                                    ID='file1',
                                    mimetype='image/png',
                                    url='http://foo')
            workspace.save_mets()
            report = WorkspaceValidator.validate(
                self.resolver,
                join(tempdir, 'mets.xml'),
                skip=['pixel_density', 'imagefilename'])
            self.assertEqual(len(report.errors), 1)
            self.assertIn("does not manifest any physical page.",
                          report.errors[0])

    def test_validate_weird_urls(self):
        """Files with a ``file:/`` URL and an unusual scheme result in exactly
        one error, which mentions 'Invalid (java) URL'."""
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file('OCR-D-GT-PAGE',
                                    ID='file1',
                                    mimetype='image/png',
                                    pageId='page1',
                                    url='file:/java-file-url')
            f = workspace.mets.add_file('OCR-D-GT-PAGE',
                                        ID='file2',
                                        mimetype='image/png',
                                        pageId='page2',
                                        url='nothttp://unusual.scheme')
            f._el.set('GROUPID', 'donotuse')  # pylint: disable=protected-access
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver,
                                                 join(tempdir, 'mets.xml'),
                                                 skip=['pixel_density'])
            self.assertEqual(len(report.errors), 1)
            self.assertIn("Invalid (java) URL", report.errors[0])

    def test_validate_pixel_no_download(self):
        """With ``download=False`` a workspace referencing a local image
        yields no errors, warnings or notices."""
        imgpath = assets.path_to(
            'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-BIN/BIN_0020.png')
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file('OCR-D-GT-BIN',
                                    ID='file1',
                                    mimetype='image/png',
                                    pageId='page1',
                                    url=imgpath)
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver,
                                                 join(tempdir, 'mets.xml'),
                                                 skip=[],
                                                 download=False)
            self.assertEqual(len(report.errors), 0)
            self.assertEqual(len(report.warnings), 0)
            self.assertEqual(len(report.notices), 0)

    def test_validate_pixel_density_too_low(self):
        """Validating the BIN_0017 image with downloading enabled yields two
        notices, about xResolution and yResolution, and nothing else."""
        imgpath = assets.path_to(
            'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-BIN/BIN_0017.png')
        with TemporaryDirectory() as tempdir:
            workspace = self.resolver.workspace_from_nothing(directory=tempdir)
            workspace.mets.unique_identifier = 'foobar'
            workspace.mets.add_file('OCR-D-GT-BIN',
                                    ID='file1',
                                    mimetype='image/png',
                                    pageId='page1',
                                    url=imgpath)
            workspace.save_mets()
            report = WorkspaceValidator.validate(self.resolver,
                                                 join(tempdir, 'mets.xml'),
                                                 skip=[],
                                                 download=True)
            self.assertEqual(len(report.notices), 2)
            self.assertIn("xResolution", report.notices[0])
            self.assertIn("yResolution", report.notices[1])
            self.assertEqual(len(report.warnings), 0)
            self.assertEqual(len(report.errors), 0)

    def test_bad_workspace(self):
        """A METS location that cannot be resolved is reported as an
        instantiation failure instead of raising."""
        report = WorkspaceValidator.validate(self.resolver, 'non existe')
        self.assertFalse(report.is_valid)
        self.assertIn('Failed to instantiate workspace:', report.errors[0])

    def test_skip_page(self):
        """The kant_aufklaerung_1784 workspace is valid once the listed
        checks are skipped."""
        report = WorkspaceValidator.validate(
            self.resolver,
            None,
            src_dir=assets.path_to('kant_aufklaerung_1784/data'),
            download=True,
            skip=[
                'page',
                'mets_unique_identifier',
                'mets_file_group_names',
                'mets_files',
                'pixel_density',
                'imagefilename',
            ])
        self.assertTrue(report.is_valid)

    def test_dimensions(self):
        """A PAGE ``imageHeight`` that disagrees with the actual image is an
        error, unless the 'dimension' check is skipped."""
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'foo')
            copytree(assets.path_to('kant_aufklaerung_1784/data'), wsdir)
            with pushd_popd(wsdir):
                # Corrupt the declared image height in the copied PAGE file.
                # Done in Python rather than by shelling out to ``sed -i`` so
                # it works on any platform and cannot fail silently.
                page_path = join(wsdir, 'OCR-D-GT-PAGE', 'PAGE_0017_PAGE.xml')
                with open(page_path, 'rb') as f_in:
                    page_xml = f_in.read()
                self.assertIn(b'imageHeight="2083"', page_xml)
                with open(page_path, 'wb') as f_out:
                    f_out.write(
                        page_xml.replace(b'imageHeight="2083"',
                                         b'imageHeight="1234"'))
                report = WorkspaceValidator.validate(
                    self.resolver,
                    join(wsdir, 'mets.xml'),
                    src_dir=wsdir,
                    skip=[
                        'page', 'mets_unique_identifier',
                        'mets_file_group_names', 'mets_files', 'pixel_density',
                        'imagefilename'
                    ],
                    download=False)
                self.assertIn(
                    "PAGE 'PAGE_0017_PAGE': @imageHeight != image's actual height (1234 != 2083)",
                    report.errors)
                # Show the collected errors when the count is off.
                self.assertEqual(len(report.errors), 1, str(report.errors))
                self.assertFalse(report.is_valid)
                report2 = WorkspaceValidator.validate(
                    self.resolver,
                    join(wsdir, 'mets.xml'),
                    src_dir=wsdir,
                    skip=[
                        'page', 'mets_unique_identifier',
                        'mets_file_group_names', 'mets_files', 'pixel_density',
                        'imagefilename', 'dimension'
                    ],
                    download=False)
            self.assertTrue(report2.is_valid)

    def test_src_dir(self):
        """Validating the kant_aufklaerung_1784 workspace via ``src_dir``
        reports 42 ``ConsistencyError`` entries."""
        report = WorkspaceValidator.validate(
            self.resolver,
            None,
            src_dir=assets.path_to('kant_aufklaerung_1784/data'),
            skip=['imagefilename'],
            download=True,
        )
        self.assertEqual(
            len([e for e in report.errors if isinstance(e, ConsistencyError)]),
            42, '42 textequiv consistency errors')

    def test_imagefilename(self):
        """With every check skipped except those not listed, the
        kant_aufklaerung_1784 workspace reports no errors."""
        report = WorkspaceValidator.validate(
            self.resolver,
            None,
            src_dir=assets.path_to('kant_aufklaerung_1784/data'),
            skip=[
                'page', 'mets_unique_identifier', 'mets_file_group_names',
                'mets_files', 'pixel_density'
            ],
            download=False,
        )
        self.assertEqual(len(report.errors), 0)
Exemplo n.º 13
0
class TestWorkspaceValidator(TestCase):
    def setUp(self):
        """Give every test its own ``Resolver`` instance."""
        resolver = Resolver()
        self.resolver = resolver

    def test_simple(self):
        """The one-file METS asset validates cleanly with downloading enabled."""
        mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
        result = WorkspaceValidator.validate(self.resolver,
                                             mets_url,
                                             download=True)
        self.assertTrue(result.is_valid)

    def test_validate_twice(self):
        """Run ``_validate`` two times on one validator and check that the
        last report is valid."""
        mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
        checker = WorkspaceValidator(self.resolver, mets_url, download=True)
        for _ in range(2):
            outcome = checker._validate()  # pylint: disable=protected-access
        self.assertTrue(outcome.is_valid)

    def test_validate_empty(self):
        """An empty workspace reports two errors; once a unique identifier is
        set and saved, only one error remains."""
        with TemporaryDirectory() as tmp:
            ws = self.resolver.workspace_from_nothing(directory=tmp)

            before = WorkspaceValidator.validate(self.resolver,
                                                 join(tmp, 'mets.xml'))
            self.assertEqual(len(before.errors), 2)
            expected_fragments = ('no unique identifier', 'No files')
            for position, fragment in enumerate(expected_fragments):
                self.assertIn(fragment, before.errors[position])

            ws.mets.unique_identifier = 'foobar'
            ws.save_mets()

            after = WorkspaceValidator.validate(self.resolver,
                                                join(tmp, 'mets.xml'))
            self.assertEqual(len(after.errors), 1)

    def test_validate_file_groups_non_ocrd(self):
        """A file group named ``FOO`` triggers a notice about the missing
        ``OCR-D-`` prefix, plus the 'No files' error."""
        with TemporaryDirectory() as tmp:
            ws = self.resolver.workspace_from_nothing(directory=tmp)
            mets = ws.mets
            mets.unique_identifier = 'foobar'
            mets.add_file_group('FOO')
            ws.save_mets()

            mets_path = join(tmp, 'mets.xml')
            result = WorkspaceValidator.validate(self.resolver, mets_path)

            self.assertEqual(len(result.errors), 1)
            self.assertIn('No files', result.errors[0])
            self.assertEqual(len(result.notices), 1)
            self.assertIn("USE does not begin with 'OCR-D-'", result.notices[0])

    def test_validate_file_groups_unspecified(self):
        """An unknown USE category in the file group name is reported as the
        first of two errors, followed by 'No files'."""
        expected_first = "Unspecified USE category 'INVALID' in fileGrp 'OCR-D-INVALID-FILEGRP'"
        with TemporaryDirectory() as tmp:
            ws = self.resolver.workspace_from_nothing(directory=tmp)
            mets = ws.mets
            mets.unique_identifier = 'foobar'
            mets.add_file_group('OCR-D-INVALID-FILEGRP')
            ws.save_mets()

            mets_path = join(tmp, 'mets.xml')
            result = WorkspaceValidator.validate(self.resolver, mets_path)

            self.assertEqual(len(result.errors), 2)
            self.assertEqual(result.errors[0], expected_first)
            self.assertIn('No files', result.errors[1])

    def test_validate_file_groups_bad_name(self):
        """An invalid USE name in the file group name is reported as the
        first of two errors, followed by 'No files'."""
        with TemporaryDirectory() as tmp:
            ws = self.resolver.workspace_from_nothing(directory=tmp)
            mets = ws.mets
            mets.unique_identifier = 'foobar'
            mets.add_file_group('OCR-D-GT-X')
            ws.save_mets()

            mets_path = join(tmp, 'mets.xml')
            result = WorkspaceValidator.validate(self.resolver, mets_path)

            self.assertEqual(len(result.errors), 2)
            expected_fragments = ("Invalid USE name 'X' in fileGrp", 'No files')
            for position, fragment in enumerate(expected_fragments):
                self.assertIn(fragment, result.errors[position])

    def test_validate_files_nopageid(self):
        """A file added without a ``pageId`` produces the single error
        'does not manifest any physical page.'."""
        with TemporaryDirectory() as tmp:
            ws = self.resolver.workspace_from_nothing(directory=tmp)
            mets = ws.mets
            mets.unique_identifier = 'foobar'
            mets.add_file('OCR-D-GT-PAGE', ID='file1', mimetype='image/png')
            ws.save_mets()

            mets_path = join(tmp, 'mets.xml')
            result = WorkspaceValidator.validate(self.resolver,
                                                 mets_path,
                                                 skip=['pixel_density'])

            self.assertEqual(len(result.errors), 1)
            self.assertIn("does not manifest any physical page.",
                          result.errors[0])

    def test_validate_weird_urls(self):
        """Unusual URLs are reported as warnings, and a ``GROUPID`` attribute
        as a notice, without producing any errors."""
        # (file ID, page ID, URL)
        file_specs = (
            ('file1', 'page1', 'file:/java-file-url'),
            ('file2', 'page2', 'nothttp://unusual.scheme'),
        )
        with TemporaryDirectory() as tmp:
            ws = self.resolver.workspace_from_nothing(directory=tmp)
            ws.mets.unique_identifier = 'foobar'
            for file_id, page_id, url in file_specs:
                last_added = ws.mets.add_file('OCR-D-GT-PAGE',
                                              ID=file_id,
                                              mimetype='image/png',
                                              pageId=page_id,
                                              url=url)
            # Only the second file carries the GROUPID attribute.
            last_added._el.set('GROUPID', 'donotuse')  # pylint: disable=protected-access
            ws.save_mets()

            mets_path = join(tmp, 'mets.xml')
            result = WorkspaceValidator.validate(self.resolver,
                                                 mets_path,
                                                 skip=['pixel_density'])

            self.assertEqual(len(result.errors), 0)
            self.assertEqual(len(result.warnings), 2)
            for position, fragment in enumerate(("Java-specific", "non-HTTP")):
                self.assertIn(fragment, result.warnings[position])
            self.assertEqual(len(result.notices), 1)
            self.assertIn("has GROUPID attribute", result.notices[0])

    def test_validate_pixel_no_download(self):
        """When ``download`` is not passed, the image is not fetched and a
        single "Won't download remote image" notice is reported."""
        image_path = assets.path_to(
            'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-BIN/BIN_0020')
        image_url = 'file://%s' % image_path
        with TemporaryDirectory() as tmp:
            ws = self.resolver.workspace_from_nothing(directory=tmp)
            ws.mets.unique_identifier = 'foobar'
            ws.mets.add_file('OCR-D-GT-BIN',
                             ID='file1',
                             mimetype='image/png',
                             pageId='page1',
                             url=image_url)
            ws.save_mets()

            mets_path = join(tmp, 'mets.xml')
            result = WorkspaceValidator.validate(self.resolver,
                                                 mets_path,
                                                 skip=[])

            for findings in (result.errors, result.warnings):
                self.assertEqual(len(findings), 0)
            self.assertEqual(len(result.notices), 1)
            self.assertIn("Won't download remote image", result.notices[0])

    def test_validate_pixel_density_too_low(self):
        """Validating the BIN_0017 image with downloading enabled yields two
        errors, about xResolution and yResolution, and nothing else."""
        image_path = assets.path_to(
            'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-BIN/BIN_0017')
        image_url = 'file://%s' % image_path
        with TemporaryDirectory() as tmp:
            ws = self.resolver.workspace_from_nothing(directory=tmp)
            ws.mets.unique_identifier = 'foobar'
            ws.mets.add_file('OCR-D-GT-BIN',
                             ID='file1',
                             mimetype='image/png',
                             pageId='page1',
                             url=image_url)
            ws.save_mets()

            mets_path = join(tmp, 'mets.xml')
            result = WorkspaceValidator.validate(self.resolver,
                                                 mets_path,
                                                 skip=[],
                                                 download=True)

            self.assertEqual(len(result.errors), 2)
            for position, fragment in enumerate(("xResolution", "yResolution")):
                self.assertIn(fragment, result.errors[position])
            for findings in (result.warnings, result.notices):
                self.assertEqual(len(findings), 0)

    def test_bad_workspace(self):
        """A METS location that cannot be resolved is reported as an
        instantiation failure in the report."""
        bogus_location = 'non existe'
        result = WorkspaceValidator.validate(self.resolver, bogus_location)
        self.assertFalse(result.is_valid)
        first_error = result.errors[0]
        self.assertIn('Failed to instantiate workspace:', first_error)

    def test_skip_page(self):
        """The kant_aufklaerung_1784 workspace is valid once the listed
        checks are skipped."""
        skipped_checks = [
            'page',
            'mets_unique_identifier',
            'mets_file_group_names',
            'mets_files',
            'pixel_density',
        ]
        data_dir = assets.path_to('kant_aufklaerung_1784/data')
        result = WorkspaceValidator.validate(self.resolver,
                                             None,
                                             src_dir=data_dir,
                                             download=True,
                                             skip=skipped_checks)
        self.assertTrue(result.is_valid)

    def test_src_dir(self):
        """Validating the kant_aufklaerung_1784 workspace via ``src_dir``
        reports 42 errors."""
        data_dir = assets.path_to('kant_aufklaerung_1784/data')
        result = WorkspaceValidator.validate(self.resolver,
                                             None,
                                             src_dir=data_dir,
                                             download=True)
        error_count = len(result.errors)
        self.assertEqual(error_count, 42)