Пример #1
0
 def test_remove_file_group_regex(self):
     """
     Remove all file groups selected by a ``//``-prefixed pattern, recursively.

     Operates on a copy of the sample data directory and checks the number
     of file groups and files before and after the removal.
     """
     sample_dir = assets.path_to('SBB0000F29300010000/data')
     with copy_of_directory(sample_dir) as workdir:
         mets_path = join(workdir, 'mets.xml')
         doc = OcrdMets(filename=mets_path)

         # Baseline of the untouched sample document.
         group_count = len(doc.file_groups)
         self.assertEqual(group_count, 17)
         file_count = len(doc.find_all_files())
         self.assertEqual(file_count, 35)

         doc.remove_file_group('//OCR-D-GT-.*', recursive=True)

         # Two groups and four files are expected to be gone afterwards.
         group_count = len(doc.file_groups)
         self.assertEqual(group_count, 15)
         file_count = len(doc.find_all_files())
         self.assertEqual(file_count, 31)
Пример #2
0
 def test_remove_file_ocrdfile(self):
     """
     Remove a file by passing the file object returned from ``find_all_files``.

     After removing ``FILE_0005_IMAGE`` the page ``PHYS_0005`` is expected
     to disappear from the list of physical pages.
     """
     sample_dir = assets.path_to('SBB0000F29300010000/data')
     with copy_of_directory(sample_dir) as workdir:
         doc = OcrdMets(filename=join(workdir, 'mets.xml'))
         pages_before = doc.physical_pages
         self.assertEqual(pages_before, ['PHYS_0001', 'PHYS_0002', 'PHYS_0005'])

         matches = doc.find_all_files(ID='FILE_0005_IMAGE')
         doc.remove_one_file(matches[0])

         pages_after = doc.physical_pages
         self.assertEqual(pages_after, ['PHYS_0001', 'PHYS_0002'])
Пример #3
0
 def test_remove_file_group0(self):
     """
     Remove a file group by its literal name.

     A non-recursive removal of ``OCR-D-GT-ALTO`` must raise an error
     matching "not empty"; the recursive removal of ``OCR-D-GT-PAGE`` is
     expected to drop one group and two files.
     """
     sample_dir = assets.path_to('SBB0000F29300010000/data')
     with copy_of_directory(sample_dir) as workdir:
         mets_path = join(workdir, 'mets.xml')
         doc = OcrdMets(filename=mets_path)

         # Baseline of the untouched sample document.
         group_count = len(doc.file_groups)
         self.assertEqual(group_count, 17)
         file_count = len(doc.find_all_files())
         self.assertEqual(file_count, 35)

         # Without recursive=True the call is expected to be refused.
         with self.assertRaisesRegex(Exception, "not empty"):
             doc.remove_file_group('OCR-D-GT-ALTO')

         doc.remove_file_group('OCR-D-GT-PAGE', recursive=True)

         group_count = len(doc.file_groups)
         self.assertEqual(group_count, 16)
         file_count = len(doc.find_all_files())
         self.assertEqual(file_count, 33)
Пример #4
0
class TestOcrdMets(TestCase):
    """
    Tests for ``OcrdMets``.

    Most tests run against the ``SBB0000F29300010000`` sample METS, which is
    loaded freshly into ``self.mets`` before each test. Tests that remove
    content work on a copy of the sample data directory instead.
    """

    def setUp(self):
        """Load the sample METS into ``self.mets``."""
        super().setUp()
        self.mets = OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'))

    def test_unique_identifier(self):
        """The unique identifier can be read from the sample and overwritten."""
        self.assertEqual(self.mets.unique_identifier, 'http://resolver.staatsbibliothek-berlin.de/SBB0000F29300010000', 'Right identifier')
        self.mets.unique_identifier = 'foo'
        self.assertEqual(self.mets.unique_identifier, 'foo', 'Right identifier after change')

    # pylint: disable=no-member
    def test_unique_identifier_from_nothing(self):
        """
        An empty METS has no identifier until one is set.

        Also checks that the serialized XML mentions the core version and a
        ``CREATEDATE`` starting with the date of the timestamp passed in.
        """
        # Take the timestamp exactly once: separate datetime.now() calls for
        # the input and the expected value can disagree across midnight.
        now = datetime.now()
        mets = OcrdMets.empty_mets(now.isoformat())
        self.assertIsNone(mets.unique_identifier, 'no identifier')
        mets.unique_identifier = 'foo'
        self.assertEqual(mets.unique_identifier, 'foo', 'Right identifier after change')
        as_string = mets.to_xml().decode('utf-8')
        self.assertIn('ocrd/core v%s' % VERSION, as_string)
        self.assertIn('CREATEDATE="%04u-%02u-%02uT' % (now.year, now.month, now.day), as_string)

    def test_str(self):
        """``str()`` of a bare ``<mets/>`` lists no file groups and no files."""
        mets = OcrdMets(content='<mets/>')
        self.assertEqual(str(mets), 'OcrdMets[fileGrps=[],files=[]]')

    #  def test_override_constructor_args(self):
    #      id2file = {'foo': {}}
    #      mets = OcrdMets(id2file, content='<mets/>')
    #      self.assertEqual(mets._file_by_id, id2file)

    def test_file_groups(self):
        """The sample METS contains 17 file groups."""
        self.assertEqual(len(self.mets.file_groups), 17, '17 file groups')

    def test_find_all_files(self):
        """
        ``find_all_files`` filters by fileGrp, ID, pageId, mimetype and url.

        Values starting with ``//`` are used here as patterns rather than
        literal values.
        """
        self.assertEqual(len(self.mets.find_all_files()), 35, '35 files total')
        self.assertEqual(len(self.mets.find_all_files(fileGrp='OCR-D-IMG')), 3, '3 files in "OCR-D-IMG"')
        self.assertEqual(len(self.mets.find_all_files(fileGrp='//OCR-D-I.*')), 13, '13 files in "//OCR-D-I.*"')
        self.assertEqual(len(self.mets.find_all_files(ID="FILE_0001_IMAGE")), 1, '1 files with ID "FILE_0001_IMAGE"')
        self.assertEqual(len(self.mets.find_all_files(ID="//FILE_0005_.*")), 1, '1 files with ID "//FILE_0005_.*"')
        self.assertEqual(len(self.mets.find_all_files(pageId='PHYS_0001')), 17, '17 files for page "PHYS_0001"')
        self.assertEqual(len(self.mets.find_all_files(pageId='PHYS_0001-NOTEXIST')), 0, '0 pages for "PHYS_0001-NOTEXIST"')
        self.assertEqual(len(self.mets.find_all_files(mimetype='image/tiff')), 13, '13 image/tiff')
        self.assertEqual(len(self.mets.find_all_files(mimetype='//application/.*')), 22, '22 application/.*')
        self.assertEqual(len(self.mets.find_all_files(mimetype=MIMETYPE_PAGE)), 20, '20 ' + MIMETYPE_PAGE)
        self.assertEqual(len(self.mets.find_all_files(url='OCR-D-IMG/FILE_0005_IMAGE.tif')), 1, '1 xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"')

    def test_find_all_files_no_regex_for_pageid(self):
        """A ``//``-prefixed pageId is rejected with an explanatory error."""
        with self.assertRaisesRegex(Exception, "not support regex search for pageId"):
            self.mets.find_all_files(pageId='//foo')

    def test_find_all_files_local_only(self):
        """With ``local_only=True`` only 3 files are found for ``PHYS_0001``."""
        self.assertEqual(len(self.mets.find_all_files(pageId='PHYS_0001', local_only=True)), 3, '3 local files for page "PHYS_0001"')

    def test_physical_pages(self):
        """The sample METS has 3 physical pages."""
        self.assertEqual(len(self.mets.physical_pages), 3, '3 physical pages')

    def test_physical_pages_from_empty_mets(self):
        """Adding a file with a pageId to an empty METS yields one physical page."""
        mets = OcrdMets(content="<mets></mets>")
        self.assertEqual(len(mets.physical_pages), 0, 'no physical page')
        mets.add_file('OUTPUT', ID="foo123", pageId="foobar")
        self.assertEqual(len(mets.physical_pages), 1, '1 physical page')

    def test_physical_pages_for_fileids(self):
        """``get_physical_pages(for_fileIds=...)`` maps file IDs to page IDs."""
        with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
            mets = OcrdMets(filename=join(tempdir, 'mets.xml'))
            self.assertEqual(mets.get_physical_pages(for_fileIds=['FILE_0002_IMAGE']), ['PHYS_0002'])

    def test_add_group(self):
        """Adding the same file group twice leaves a single group."""
        mets = OcrdMets.empty_mets()
        self.assertEqual(len(mets.file_groups), 0, '0 file groups')
        mets.add_file_group('TEST')
        self.assertEqual(len(mets.file_groups), 1, '1 file groups')
        mets.add_file_group('TEST')
        self.assertEqual(len(mets.file_groups), 1, '1 file groups')

    def test_add_file(self):
        """
        Files can be added to a new group and re-assigned to other pages.

        ``set_physical_page_for_file`` is exercised with new page IDs as well
        as with a page ID that another file already uses.
        """
        mets = OcrdMets.empty_mets()
        self.assertEqual(len(mets.file_groups), 0, '0 file groups')
        self.assertEqual(len(list(mets.find_all_files(fileGrp='OUTPUT'))), 0, '0 files in "OUTPUT"')
        f = mets.add_file('OUTPUT', ID="foo123", mimetype="bla/quux", pageId="foobar")
        f2 = mets.add_file('OUTPUT', ID="foo1232", mimetype="bla/quux", pageId="foobar")
        self.assertEqual(f.pageId, 'foobar', 'pageId set')
        self.assertEqual(len(mets.file_groups), 1, '1 file groups')
        self.assertEqual(len(list(mets.find_all_files(fileGrp='OUTPUT'))), 2, '2 files in "OUTPUT"')
        mets.set_physical_page_for_file('barfoo', f, order='300', orderlabel="page 300")
        self.assertEqual(f.pageId, 'barfoo', 'pageId changed')
        mets.set_physical_page_for_file('quux', f2, order='302', orderlabel="page 302")
        self.assertEqual(f2.pageId, 'quux', 'pageId changed')
        mets.set_physical_page_for_file('barfoo', f2, order='301', orderlabel="page 301")
        self.assertEqual(f2.pageId, 'barfoo', 'pageId changed')
        self.assertEqual(len(mets.file_groups), 1, '1 file group')

    def test_add_file_ID_already_exists(self):
        """A duplicate ID raises unless ``force=True``, which reuses the element."""
        f = self.mets.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop")
        self.assertEqual(f.ID, 'best-id-ever', "ID kept")
        with self.assertRaisesRegex(Exception, "File with ID='best-id-ever' already exists"):
            self.mets.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep")
        f2 = self.mets.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", force=True)
        self.assertEqual(f._el, f2._el)

    def test_add_file_ignore(self):
        """With ``ignore=True`` a duplicate ID does not raise (no further checks)."""
        self.mets.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop")
        self.mets.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", ignore=True)

    def test_add_file_ID_invalid(self):
        """An ID with invalid syntax is rejected."""
        with self.assertRaisesRegex(Exception, "Invalid syntax for mets:file/@ID 1234:::"):
            self.mets.add_file('OUTPUT', ID='1234:::', mimetype="beep/boop")

    def test_filegrp_from_file(self):
        """A file found via its group reports that group as ``fileGrp``."""
        f = self.mets.find_all_files(fileGrp='OCR-D-IMG')[0]
        self.assertEqual(f.fileGrp, 'OCR-D-IMG')

    def test_add_file_no_id(self):
        """Adding a file without an ID raises."""
        with self.assertRaisesRegex(Exception, "Must set ID of the mets:file"):
            self.mets.add_file('FOO')

    def test_add_file_no_pageid(self):
        """A file added without a pageId reports ``None`` as its pageId."""
        f = self.mets.add_file('OUTPUT', mimetype="bla/quux", ID="foo3")
        self.assertIsNone(f.pageId, 'No pageId')

    def test_file_pageid(self):
        """The pageId of an existing file can be read and reassigned."""
        f = self.mets.find_all_files()[0]
        self.assertEqual(f.pageId, 'PHYS_0001')
        f.pageId = 'foo'
        self.assertEqual(f.pageId, 'foo')

    def test_agent(self):
        """``add_agent`` increases the number of agents by one."""
        mets = self.mets
        beforelen = len(mets.agents)
        mets.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'YETOTHERSTILL')
        self.assertEqual(len(mets.agents), beforelen + 1)

    def test_metshdr(self):
        """
        Test whether metsHdr is created on-demand
        """
        mets = OcrdMets(content="<mets></mets>")
        # len(element) counts child elements; Element.getchildren() is
        # deprecated in lxml and gone from the stdlib ElementTree.
        self.assertEqual(len(mets._tree.getroot()), 0)
        mets.add_agent()
        self.assertEqual(len(mets._tree.getroot()), 1)

    def test_nocontent_nofilename(self):
        """Constructing ``OcrdMets`` without filename or content raises."""
        with self.assertRaisesRegex(Exception, "Must pass 'filename' or 'content' to"):
            OcrdMets()

    def test_encoding_entities(self):
        """Non-ASCII characters survive a round trip through ``to_xml``."""
        mets = OcrdMets(content="""
        <mets>
          <metsHdr>
            <agent>
              <name>Őh śéé Áŕ</name>
              <note>OCR-D</note>
            </agent>
          </metsHdr>
        </mets>
        """)
        self.assertIn('Őh śéé Áŕ', mets.to_xml().decode('utf-8'))

    def test_remove_page(self):
        """``remove_physical_page`` drops the page from ``physical_pages``."""
        with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
            mets = OcrdMets(filename=join(tempdir, 'mets.xml'))
            self.assertEqual(mets.physical_pages, ['PHYS_0001', 'PHYS_0002', 'PHYS_0005'])
            mets.remove_physical_page('PHYS_0001')
            self.assertEqual(mets.physical_pages, ['PHYS_0002', 'PHYS_0005'])

    def test_remove_page_after_remove_file(self):
        """Removing ``FILE_0005_IMAGE`` by ID also removes page ``PHYS_0005``."""
        with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
            mets = OcrdMets(filename=join(tempdir, 'mets.xml'))
            self.assertEqual(mets.physical_pages, ['PHYS_0001', 'PHYS_0002', 'PHYS_0005'])
            mets.remove_one_file('FILE_0005_IMAGE')
            self.assertEqual(mets.physical_pages, ['PHYS_0001', 'PHYS_0002'])

    def test_remove_file_ocrdfile(self):
        """Removing a file via its file object also removes page ``PHYS_0005``."""
        with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
            mets = OcrdMets(filename=join(tempdir, 'mets.xml'))
            self.assertEqual(mets.physical_pages, ['PHYS_0001', 'PHYS_0002', 'PHYS_0005'])
            ocrd_file = mets.find_all_files(ID='FILE_0005_IMAGE')[0]
            mets.remove_one_file(ocrd_file)
            self.assertEqual(mets.physical_pages, ['PHYS_0001', 'PHYS_0002'])

    def test_remove_file_regex(self):
        """``remove_file`` accepts a ``//``-prefixed pattern for the file ID."""
        with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
            mets = OcrdMets(filename=join(tempdir, 'mets.xml'))
            self.assertEqual(mets.physical_pages, ['PHYS_0001', 'PHYS_0002', 'PHYS_0005'])
            mets.remove_file('//FILE_0005.*')
            self.assertEqual(mets.physical_pages, ['PHYS_0001', 'PHYS_0002'])

    def test_remove_file_group0(self):
        """
        Test removal of filegrp

        A non-recursive removal of ``OCR-D-GT-ALTO`` must raise "not empty";
        the recursive removal of ``OCR-D-GT-PAGE`` drops one group and two files.
        """
        with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
            mets = OcrdMets(filename=join(tempdir, 'mets.xml'))
            self.assertEqual(len(mets.file_groups), 17)
            self.assertEqual(len(mets.find_all_files()), 35)
            with self.assertRaisesRegex(Exception, "not empty"):
                mets.remove_file_group('OCR-D-GT-ALTO')
            mets.remove_file_group('OCR-D-GT-PAGE', recursive=True)
            self.assertEqual(len(mets.file_groups), 16)
            self.assertEqual(len(mets.find_all_files()), 33)

    def test_remove_file_group_regex(self):
        """
        Test removal of filegrp

        Uses a ``//``-prefixed pattern with ``recursive=True``; two groups and
        four files are expected to be removed.
        """
        with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
            mets = OcrdMets(filename=join(tempdir, 'mets.xml'))
            self.assertEqual(len(mets.file_groups), 17)
            self.assertEqual(len(mets.find_all_files()), 35)
            mets.remove_file_group('//OCR-D-GT-.*', recursive=True)
            self.assertEqual(len(mets.file_groups), 15)
            self.assertEqual(len(mets.find_all_files()), 31)