예제 #1
0
 def test_merge(self):
     """Merging a second workspace raises the METS file count from 6 to 41 and brings its image file along."""
     with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as ws1dir:
         with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as ws2dir:
             target = Workspace(self.resolver, ws1dir)
             source = Workspace(self.resolver, ws2dir)
             # before the merge only the target's own files are listed
             assert len(target.mets.find_all_files()) == 6
             target.merge(source)
             assert len(target.mets.find_all_files()) == 41
             merged_image = join(ws1dir, 'OCR-D-IMG/FILE_0001_IMAGE.tif')
             assert exists(merged_image)
def test_workspace_remove_groups_unforce(workspace_directory):
    """Removing file groups by pattern, recursively, drops the ALTO group from the saved METS."""
    mets_path = os.path.join(workspace_directory, 'mets.xml')
    alto_group_query = './/{http://www.loc.gov/METS/}fileGrp[@USE="OCR-D-GT-ALTO"]'

    # arrange: the METS starts with exactly one ALTO group holding two files
    root_before = ET.parse(mets_path).getroot()
    groups_before = root_before.findall(alto_group_query)
    assert len(groups_before) == 1
    files_before = groups_before[0].findall('.//{http://www.loc.gov/METS/}file')
    assert len(files_before) == 2

    # act
    ws = Workspace(Resolver(), workspace_directory)
    ws.remove_file_group('//OCR-D-GT.*', recursive=True)
    ws.save_mets()

    # assert: re-read the file from disk, the group must be gone
    root_after = ET.parse(mets_path).getroot()
    assert root_after is not None
    assert not root_after.findall(alto_group_query)
예제 #3
0
def test_workspace_init_missing_mets():
    """Constructing a Workspace on a directory without a METS file must raise."""
    with pytest.raises(Exception) as raised:
        Workspace(Resolver(), "foo/bar")

    message = str(raised.value)
    assert "File does not exist" in message
예제 #4
0
 def test_remove_file_group_rmdir(self):
     """Recursively removing a file group also removes its directory on disk."""
     with copy_of_directory(
             assets.path_to('SBB0000F29300010000/data')) as tempdir:
         ws = Workspace(self.resolver, directory=tempdir)
         group_dir = join(tempdir, 'OCR-D-IMG')
         self.assertTrue(exists(group_dir))
         ws.remove_file_group('OCR-D-IMG', recursive=True)
         self.assertFalse(exists(group_dir))
예제 #5
0
    def workspace_from_folder(self, directory, return_mets=False, clobber_mets=False, convention='ocrd-gt'):
        """
        Create a workspace from a folder, creating a METS file.

        Args:
            directory (string) : Existing directory to build the workspace from. Must not be None.
            convention: See add_files_to_mets
            clobber_mets (boolean) : Whether to overwrite existing mets.xml. Default: False.
            return_mets (boolean) : Do not create the actual mets.xml file but return the :class:`OcrdMets`. Default: False.

        Returns:
            The :class:`OcrdMets` if ``return_mets`` is set, otherwise a
            :class:`Workspace` on the absolute path of ``directory``.

        Raises:
            Exception: If ``directory`` is None, is not an existing directory,
                or already contains a ``mets.xml`` while ``clobber_mets`` is false.
        """
        if directory is None:
            raise Exception("Must pass directory")
        if not os.path.isdir(directory):
            raise Exception("Directory does not exist or is not a directory: '%s'" % directory)
        if not clobber_mets and os.path.exists(os.path.join(directory, 'mets.xml')):
            raise Exception("Not clobbering existing mets.xml in '%s'." % directory)

        mets = OcrdMets(content=METS_XML_EMPTY)

        # The isdir() guard above guarantees the directory exists, so there is
        # nothing to create here; only normalise the path.
        directory = os.path.abspath(directory)

        self.add_files_to_mets(convention, mets, directory)
        if return_mets:
            return mets

        mets_fpath = os.path.join(directory, 'mets.xml')
        with open(mets_fpath, 'wb') as fmets:
            log.info("Writing %s", mets_fpath)
            fmets.write(mets.to_xml(xmllint=True))

        return Workspace(self, directory, mets)
예제 #6
0
    def test_superfluous_copies_in_ws_dir(self):
        """
        Downloading the files listed in a one-file METS adds exactly one file to the workspace.

        https://github.com/OCR-D/core/issues/227
        """
        def find_recursive(root):
            # flat list of every file name found below root
            return [name for _, _, names in walk(root) for name in names]

        with TemporaryDirectory() as wsdir:
            mets_source = assets.path_to(
                'SBB0000F29300010000/data/mets_one_file.xml')
            with open(mets_source, 'r') as f_in:
                with open(join(wsdir, 'mets.xml'), 'w') as f_out:
                    f_out.write(f_in.read())
            # only the METS copy exists so far
            self.assertEqual(len(find_recursive(wsdir)), 1)
            ws1 = Workspace(self.resolver, wsdir)
            for ocrd_file in ws1.mets.find_files():
                ws1.download_file(ocrd_file)
            self.assertEqual(len(find_recursive(wsdir)), 2)
            downloaded = join(wsdir, 'OCR-D-IMG/FILE_0005_IMAGE.tif')
            self.assertTrue(exists(downloaded))
예제 #7
0
    def workspace_from_nothing(self,
                               directory,
                               mets_basename='mets.xml',
                               clobber_mets=False):
        """
        Create an empty workspace.

        Arguments:
            directory (string): Target directory for the workspace; created
                (including parents) if missing. If ``None``, a temporary
                directory is created with prefix :py:data:`ocrd.constants.TMP_PREFIX`.
        Keyword Arguments:
            mets_basename (string, 'mets.xml'): File name of the METS inside ``directory``.
            clobber_mets (boolean, False): Whether to overwrite an existing METS file.
                By default an existing METS file raises an exception.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            FileExistsError: If the METS file already exists and ``clobber_mets`` is not set.
        """
        log = getLogger('ocrd.resolver.workspace_from_nothing')
        if directory is None:
            directory = mkdtemp(prefix=TMP_PREFIX)
        workspace_dir = Path(directory)
        workspace_dir.mkdir(parents=True, exist_ok=True)
        mets_path = workspace_dir / mets_basename
        if not clobber_mets and mets_path.exists():
            message = "METS '%s' already exists in '%s' and clobber_mets not set." % (
                mets_basename, directory)
            raise FileExistsError(message)
        mets = OcrdMets.empty_mets()
        log.info("Writing METS to %s", mets_path)
        mets_path.write_bytes(mets.to_xml(xmllint=True))

        return Workspace(self, directory, mets, mets_basename=mets_basename)
예제 #8
0
 def test_remove_file_page_recursive_same_group(self):
     """Page-recursive removal restricted to the same group reduces the file count by exactly one."""
     with copy_of_directory(assets.path_to('kant_aufklaerung_1784-complex/data')) as tempdir:
         with pushd_popd(tempdir):
             ws = Workspace(self.resolver, directory=tempdir)
             before = count_files()
             ws.remove_file('OCR-D-IMG-BINPAGE-sauvola_0001', page_recursive=True, page_same_group=True, force=False)
             after = count_files()
             # The failure message states the same expectation as the assertion: one file less.
             self.assertEqual(after, before - 1, '1 file deleted')
예제 #9
0
 def test_remove_file_page_recursive(self):
     """Page-recursive removal across groups shrinks the METS file list from 119 to 83 entries."""
     with copy_of_directory(assets.path_to('kant_aufklaerung_1784-complex/data')) as tempdir, \
          pushd_popd(tempdir):
         ws = Workspace(self.resolver, directory=tempdir)
         self.assertEqual(len(ws.mets.find_files()), 119)
         ws.remove_file(
             'OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001',
             page_recursive=True,
             page_same_group=False,
             keep_file=True)
         self.assertEqual(len(ws.mets.find_files()), 83)
         # second removal is only exercised, its effect is not asserted
         ws.remove_file('PAGE_0017_ALTO', page_recursive=True)
예제 #10
0
파일: resolver.py 프로젝트: mjenckel/core
    def workspace_from_url(self, mets_url, dst_dir=None, clobber_mets=False, mets_basename=None, download=False, src_baseurl=None):
        """
        Create a workspace from a METS by URL (i.e. clone it).

        Sets the mets.xml file

        Arguments:
            mets_url (string): Source mets URL
            dst_dir (string, None): Target directory for the workspace. If unset, the parent directory of a local ``mets_url`` is used, otherwise a new temporary directory. Created if missing.
            clobber_mets (boolean, False): Whether to overwrite existing mets.xml. By default existing mets.xml will raise an exception.
            mets_basename (string, None): File name of the METS in ``dst_dir``. Defaults to the last URL segment of ``mets_url``.
            download (boolean, False): Whether to download all the files
            src_baseurl (string, None): Base URL for resolving relative file locations. Defaults to ``mets_url`` without its last segment.

        Returns:
            Workspace

        Raises:
            ValueError: If ``mets_url`` is None.
        """

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path.cwd() / mets_url)

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            src_baseurl = remove_non_path_from_url(remove_non_path_from_url(mets_url)[:-len(last_segment)])

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                log.debug("Deriving dst_dir %s from %s", Path(mets_url).parent, mets_url)
                dst_dir = Path(mets_url).parent
            else:
                # Create the directory first so the message reports its actual path
                dst_dir = tempfile.mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url)
        # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
        if not Path(dst_dir).exists():
            Path(dst_dir).mkdir(parents=True, exist_ok=False)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
            mets_basename, mets_url, src_baseurl, dst_dir)

        self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'raise')

        workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl)

        if download:
            for f in workspace.mets.find_files():
                workspace.download_file(f)

        return workspace
예제 #11
0
def test_workspace_remove_group_not_found(workspace_directory):
    """Group identified by name not found raises exception"""

    resolver = Resolver()
    workspace = Workspace(resolver, workspace_directory)
    with pytest.raises(Exception) as exc:
        workspace.remove_file_group('FOO-BAR')

    # Match against the raised exception itself, not the ExceptionInfo wrapper
    assert "No such fileGrp" in str(exc.value)
예제 #12
0
    def workspace_from_url(self, mets_url, directory=None, clobber_mets=False, mets_basename=None, download=False, download_local=False):
        """
        Create a workspace from a METS by URL.

        Sets the mets.xml file

        Args:
            mets_url (string) : Source METS URL or path. If None, it is built from ``directory`` and ``mets_basename``.
            directory (string) : Target directory; relative paths are made absolute. If None, the directory of a ``file://`` METS is used, otherwise a new temporary directory.
            clobber_mets (boolean) : Whether to overwrite an existing METS file in ``directory``. Default: False.
            mets_basename (string) : File name of the METS inside ``directory``. Default: last segment of ``mets_url``, or ``mets.xml`` when ``mets_url`` is not given.
            download (boolean) : Download every file group via ``download_files_in_group``. Default: False.
            download_local (boolean) : Download only the files found with ``local_only=True``; takes precedence over ``download``. Default: False.

        Returns:
            Workspace

        Raises:
            Exception: If neither ``mets_url`` nor ``directory`` is given, or if
                the target METS exists and ``clobber_mets`` is false.
        """
        if directory is not None and not directory.startswith('/'):
            directory = os.path.abspath(directory)

        if mets_url is None:
            if directory is None:
                raise Exception("Must pass mets_url and/or directory to workspace_from_url")
            # Fall back to 'mets.xml' so an unset basename is not formatted as 'None'
            mets_url = 'file://%s/%s' % (directory, mets_basename if mets_basename else 'mets.xml')
        if mets_url.find('://') == -1:
            # resolve to absolute
            mets_url = os.path.abspath(mets_url)
            mets_url = 'file://' + mets_url
        if directory is None:
            # if mets_url is a file-url assume working directory to be  where
            # the mets.xml resides
            if mets_url.startswith('file://'):
                # if directory was not given and mets_url is a file assume that
                # directory should be the directory where the mets.xml resides
                directory = os.path.dirname(mets_url[len('file://'):])
            else:
                directory = tempfile.mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating workspace '%s' for METS @ <%s>", directory, mets_url)

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = mets_url \
                .rsplit('/', 1)[-1] \
                .split('?')[0] \
                .split('#')[0]

        mets_fpath = os.path.join(directory, mets_basename)
        log.debug("Copying mets url '%s' to '%s'", mets_url, mets_fpath)
        if 'file://' + mets_fpath == mets_url:
            log.debug("Target and source mets are identical")
        else:
            if os.path.exists(mets_fpath) and not clobber_mets:
                raise Exception("File '%s' already exists but clobber_mets is false" % mets_fpath)
            else:
                self.download_to_directory(directory, mets_url, basename=mets_basename)

        workspace = Workspace(self, directory, mets_basename=mets_basename)

        if download_local or download:
            for file_grp in workspace.mets.file_groups:
                if download_local:
                    for f in workspace.mets.find_files(fileGrp=file_grp, local_only=True):
                        workspace.download_file(f, subdir=file_grp)
                else:
                    workspace.download_files_in_group(file_grp)

        return workspace
예제 #13
0
 def setUp(self):
     """Create a resolver, a bagger and a workspace on a private copy of the kant_aufklaerung_1784 asset."""
     resolver = Resolver()
     self.resolver = resolver
     self.bagger = WorkspaceBagger(resolver)
     # copy the asset into a fresh temporary directory so tests can modify it
     self.tempdir = mkdtemp()
     self.bagdir = join(self.tempdir, 'kant_aufklaerung_1784')
     copytree(assets.path_to('kant_aufklaerung_1784'), self.bagdir)
     self.workspace_dir = join(self.bagdir, 'data')
     self.workspace = Workspace(resolver, directory=self.workspace_dir)
예제 #14
0
 def test_remove_file_force(self):
     """Removing an unknown file ID raises FileNotFoundError unless ``force`` is set."""
     with copy_of_directory(
             assets.path_to('SBB0000F29300010000/data')) as tempdir:
         ws = Workspace(self.resolver, directory=tempdir)
         unknown_id = 'non-existing-id'
         # without force the removal must fail
         with self.assertRaisesRegex(FileNotFoundError, "not found"):
             ws.remove_file(unknown_id)
         # with force the same call must pass
         ws.remove_file(unknown_id, force=True)
예제 #15
0
 def test_remove_file_group_force(self):
     """Removing an unknown file group raises unless ``force`` is set."""
     with copy_of_directory(
             assets.path_to('SBB0000F29300010000/data')) as tempdir:
         ws = Workspace(self.resolver, directory=tempdir)
         unknown_group = 'I DO NOT EXIST'
         # without force: error expected
         with self.assertRaisesRegex(Exception, "No such fileGrp"):
             ws.remove_file_group(unknown_group)
         # with force: no error
         ws.remove_file_group(unknown_group, force=True)
예제 #16
0
 def test_remove_file_group_force(self):
     """Removing an unknown file group raises unless ``force`` or the workspace's ``overwrite_mode`` is set."""
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
         ws = Workspace(self.resolver, directory=tempdir)
         unknown_group = 'I DO NOT EXIST'
         # plain call: must fail
         with self.assertRaisesRegex(Exception, "No such fileGrp"):
             ws.remove_file_group(unknown_group)
         # forced call: must pass
         ws.remove_file_group(unknown_group, force=True)
         # unforced call with overwrite_mode enabled: must pass as well
         ws.overwrite_mode = True
         ws.remove_file_group(unknown_group, force=False)
예제 #17
0
 def setUp(self):
     """Clear any leftover backup directory and open a workspace on a private copy of the kant_aufklaerung_1784 asset."""
     super().setUp()
     # start from a clean slate: drop backups left behind by earlier runs
     if exists(BACKUPDIR):
         rmtree(BACKUPDIR)
     resolver = Resolver()
     self.resolver = resolver
     self.bagger = WorkspaceBagger(resolver)
     self.tempdir = mkdtemp()
     self.bagdir = join(self.tempdir, 'bag')
     copytree(assets.path_to('kant_aufklaerung_1784'), self.bagdir)
     self.workspace_dir = join(self.bagdir, 'data')
     self.workspace = Workspace(resolver, directory=self.workspace_dir)
예제 #18
0
def test_merge(tmp_path):
    """Merging a second workspace raises the METS file count from 6 to 41 and brings its image file along."""
    # arrange: one directory per workspace below tmp_path
    kant_dir = tmp_path / 'kant_aufklaerung'
    kant_dir.mkdir()
    sbb_dir = tmp_path / 'sbb'
    sbb_dir.mkdir()
    for asset_name, destination in (('kant_aufklaerung_1784/data', kant_dir),
                                    ('SBB0000F29300010000/data', sbb_dir)):
        copytree(assets.path_to(asset_name), destination)

    target = Workspace(Resolver(), kant_dir)
    source = Workspace(Resolver(), sbb_dir)

    # number of files before the merge
    assert len(target.mets.find_all_files()) == 6

    # act
    target.merge(source)

    # assert
    assert len(target.mets.find_all_files()) == 41
    merged_image = join(kant_dir, 'OCR-D-IMG/FILE_0001_IMAGE.tif')
    assert exists(merged_image)
예제 #19
0
 def test_rename_file_group(self):
     """Renaming a file group moves its image file and updates the image reference in a PAGE file."""
     with copy_of_directory(assets.path_to('kant_aufklaerung_1784-page-region-line-word_glyph/data')) as tempdir:
         workspace = Workspace(self.resolver, directory=tempdir)
         with pushd_popd(tempdir):
             def referenced_image():
                 # image file name recorded in the PAGE file of the word-level GT
                 page_file = next(workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001'))
                 return page_from_file(page_file).get_Page().imageFilename

             assert referenced_image() == 'OCR-D-IMG/OCR-D-IMG_0001.tif'
             workspace.rename_file_group('OCR-D-IMG', 'FOOBAR')
             assert referenced_image() == 'FOOBAR/OCR-D-IMG_0001.tif'
             new_location = Path('FOOBAR/OCR-D-IMG_0001.tif')
             old_location = Path('OCR-D-IMG/OCR-D-IMG_0001.tif')
             assert new_location.exists()
             assert not old_location.exists()
예제 #20
0
def test_workspace_remove_single_group_recursive(workspace_directory):
    """Removing one file group by name, recursively, leaves no files in that group."""
    group = 'OCR-D-GT-ALTO'

    # arrange: the group starts out with two files
    ws = Workspace(Resolver(), workspace_directory)
    files_before = ws.mets.find_files(fileGrp=group)
    assert len(files_before) == 2

    # act
    ws.remove_file_group(group, recursive=True)

    # assert
    files_after = ws.mets.find_files(fileGrp=group)
    assert len(files_after) == 0
예제 #21
0
파일: resolver.py 프로젝트: mjenckel/core
    def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
        """
        Create an empty workspace.

        Arguments:
            directory (string): Target directory for the workspace; created (including parents) if missing. If None, a temporary directory is created.
            mets_basename (string, 'mets.xml'): File name of the METS inside ``directory``.
            clobber_mets (boolean, False): Whether to overwrite an existing METS file. By default an existing METS file raises an exception.

        Returns:
            Workspace

        Raises:
            FileExistsError: If the METS file already exists and ``clobber_mets`` is not set.
        """
        if directory is None:
            directory = tempfile.mkdtemp(prefix=TMP_PREFIX)
        Path(directory).mkdir(parents=True, exist_ok=True)
        mets_path = Path(directory, mets_basename)
        if mets_path.exists() and not clobber_mets:
            raise FileExistsError("METS '%s' already exists in '%s' and clobber_mets not set." % (mets_basename, directory))
        mets = OcrdMets.empty_mets()
        log.info("Writing METS to %s", mets_path)
        mets_path.write_bytes(mets.to_xml(xmllint=True))

        # Hand the basename on so the workspace refers to the file just written
        return Workspace(self, directory, mets, mets_basename=mets_basename)
예제 #22
0
    def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
        """
        Create an empty workspace.

        Writes an empty METS named ``mets_basename`` into ``directory`` (a new
        temporary directory when ``directory`` is None; created if missing)
        and returns a workspace on it.

        Raises:
            Exception: If the METS file already exists and ``clobber_mets`` is false.
        """
        directory = tempfile.mkdtemp(prefix=TMP_PREFIX) if directory is None else directory
        if not os.path.exists(directory):
            os.makedirs(directory)

        mets_fpath = os.path.join(directory, mets_basename)
        already_there = os.path.exists(mets_fpath)
        if already_there and not clobber_mets:
            raise Exception("Not clobbering existing mets.xml in '%s'." % directory)

        empty_mets = OcrdMets.empty_mets()
        with open(mets_fpath, 'wb') as mets_out:
            log.info("Writing %s", mets_fpath)
            mets_out.write(empty_mets.to_xml(xmllint=True))

        return Workspace(self, directory, empty_mets)
예제 #23
0
    def workspace_from_url(self, mets_url, directory=None):
        """
        Create a workspace from a METS by URL.

        The METS is fetched as ``mets.xml`` into ``directory`` (a new temporary
        directory when none is given). A ``mets_url`` without a scheme is
        prefixed with ``file://``.

        Raises:
            Exception: If ``mets_url`` is None.
        """
        if mets_url is None:
            raise Exception("Must pass mets_url to workspace_from_url")
        if '://' not in mets_url:
            mets_url = 'file://' + mets_url
        target_dir = directory
        if target_dir is None:
            target_dir = tempfile.mkdtemp(prefix=TMP_PREFIX)
        log.debug("Creating workspace '%s' for METS @ <%s>", target_dir, mets_url)
        self.download_to_directory(
            target_dir, mets_url, basename='mets.xml', prefer_symlink=False)
        return Workspace(self, target_dir)
예제 #24
0
    def unpack_workspace_from_filename(self, zip_filename, directory=None):
        """
        Unpack an OCRD-ZIP to a local workspace.

        1. Pick the target directory (a new temporary one when none is given)
        2. Unpack the zipfile into it
        3. Initiate a workspace on it

        Args:
            zip_filename (string) : Path to OCRD-ZIP file
            directory (string) : Directory to extract into. Default: a new temporary directory.

        Returns:
            Workspace on the extraction directory
        """
        target_dir = directory
        if target_dir is None:
            target_dir = tempfile.mkdtemp(prefix=TMP_PREFIX)
        log.debug("Unpacking to %s", target_dir)
        with ZipFile(zip_filename, 'r') as archive:
            archive.extractall(path=target_dir)
        return Workspace(self, target_dir)
예제 #25
0
def test_superfluous_copies_in_ws_dir(tmp_path):
    """
    Downloading the files listed in a one-file METS adds exactly one file to the workspace.

    https://github.com/OCR-D/core/issues/227
    """
    # arrange: the workspace directory holds nothing but the METS
    copyfile(assets.path_to('SBB0000F29300010000/data/mets_one_file.xml'),
             join(tmp_path, 'mets.xml'))
    workspace = Workspace(Resolver(), tmp_path)

    # assert directory files
    assert count_files(tmp_path) == 1

    # act
    for ocrd_file in workspace.mets.find_all_files():
        workspace.download_file(ocrd_file)

    # assert
    assert count_files(tmp_path) == 2
    downloaded = join(tmp_path, 'OCR-D-IMG/FILE_0005_IMAGE.tif')
    assert exists(downloaded)
예제 #26
0
    def test_227_1(self):
        """Downloading the files listed in a one-file METS adds exactly one file to the workspace."""
        def find_recursive(root):
            # flat list of every file name found below root
            return [name for _, _, names in os.walk(root) for name in names]

        with TemporaryDirectory() as wsdir:
            mets_source = assets.path_to(
                'SBB0000F29300010000/data/mets_one_file.xml')
            with open(mets_source, 'r') as f_in:
                with open(join(wsdir, 'mets.xml'), 'w') as f_out:
                    f_out.write(f_in.read())
            # only the METS copy exists so far
            self.assertEqual(len(find_recursive(wsdir)), 1)
            ws1 = Workspace(self.resolver, wsdir)
            for ocrd_file in ws1.mets.find_files():
                ws1.download_file(ocrd_file)
            self.assertEqual(len(find_recursive(wsdir)), 2)
            downloaded = join(wsdir, 'OCR-D-IMG/FILE_0005_IMAGE')
            self.assertTrue(exists(downloaded))
예제 #27
0
def test_rename_file_group(tmp_path):
    """Renaming a file group renames its files on disk and updates PAGE image references and page mappings."""
    # arrange
    copytree(
        assets.path_to(
            'kant_aufklaerung_1784-page-region-line-word_glyph/data'),
        str(tmp_path))
    workspace = Workspace(Resolver(), directory=str(tmp_path))

    # before act
    # TODO clear semantics
    # requires rather odd additional path-setting because root path from
    # workspace is not propagated - works only if called inside workspace
    # which can be achieved with pushd_popd functionalities
    ocrd_file = next(workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001'))
    relative_name = ocrd_file.local_filename
    ocrd_file.local_filename = join(tmp_path, relative_name)
    pcgts_before = page_from_file(ocrd_file)
    # before assert
    assert pcgts_before.get_Page(
    ).imageFilename == 'OCR-D-IMG/OCR-D-IMG_0001.tif'

    # act
    workspace.rename_file_group('OCR-D-IMG', 'FOOBAR')
    next_ocrd_file = next(
        workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001'))
    next_ocrd_file.local_filename = join(tmp_path, relative_name)
    pcgts_after = page_from_file(next_ocrd_file)

    # assert
    assert pcgts_after.get_Page().imageFilename == 'FOOBAR/FOOBAR_0001.tif'
    assert Path(tmp_path / 'FOOBAR/FOOBAR_0001.tif').exists()
    # Check inside the workspace directory: this test does not change into
    # tmp_path, so a bare relative path would be looked up in the current
    # working directory instead.
    assert not Path(tmp_path / 'OCR-D-IMG/OCR-D-IMG_0001.tif').exists()
    assert workspace.mets.get_physical_pages(
        for_fileIds=['OCR-D-IMG_0001']) == [None]
    assert workspace.mets.get_physical_pages(for_fileIds=['FOOBAR_0001']) == [
        'phys_0001'
    ]
예제 #28
0
def _fixture_kant_complex(tmp_path):
    """Yield a workspace on a copy of the ``kant_aufklaerung_1784-complex`` asset placed in ``tmp_path``."""
    copytree(assets.path_to('kant_aufklaerung_1784-complex/data'),
             str(tmp_path))
    # Pass a Resolver instance, not the class itself
    yield Workspace(Resolver(), directory=tmp_path)
예제 #29
0
    def workspace_from_url(self,
                           mets_url,
                           dst_dir=None,
                           clobber_mets=False,
                           mets_basename=None,
                           download=False,
                           baseurl=None):
        """
        Create a workspace from a METS by URL.

        Sets the mets.xml file

        Arguments:
            mets_url (string): Source mets URL. If None, it is built from ``baseurl`` and ``mets_basename``.
            dst_dir (string, None): Target directory for the workspace
            clobber_mets (boolean, False): Whether to overwrite existing mets.xml. By default existing mets.xml will raise an exception.
            mets_basename (string, None): File name of the METS in ``dst_dir``. Defaults to the last segment of ``mets_url``.
            download (boolean, False): Whether to download all the files
            baseurl (string, None): Base URL for resolving relative file locations. Defaults to the location of the METS without its last segment.

        Returns:
            Workspace

        Raises:
            Exception: If neither ``mets_url`` nor ``baseurl`` is given, or if
                the target METS already exists and ``clobber_mets`` is false.
        """
        if dst_dir and not dst_dir.startswith('/'):
            dst_dir = abspath(dst_dir)

        if mets_url is None:
            if baseurl is None:
                raise Exception(
                    "Must pass mets_url and/or baseurl to workspace_from_url")
            mets_url = 'file://%s/%s' % (baseurl, mets_basename
                                         if mets_basename else 'mets.xml')

        if '://' not in mets_url:
            # Plain filesystem path: resolve to absolute before deriving the
            # base, otherwise a bare file name such as 'mets.xml' would end up
            # being its own base.
            mets_path = abspath(mets_url)
            if baseurl is None:
                baseurl = dirname(mets_path)
            mets_url = 'file://%s' % mets_path
        elif baseurl is None:
            baseurl = mets_url.rsplit('/', 1)[0]
        log.debug(
            "workspace_from_url\nmets_url='%s'\nbaseurl='%s'\ndst_dir='%s'",
            mets_url, baseurl, dst_dir)

        if dst_dir is None:
            # if mets_url is a file-url assume working directory is source directory
            if mets_url.startswith('file://'):
                # if dst_dir was not given and mets_url is a file assume that
                # dst_dir should be the directory where the mets.xml resides
                dst_dir = dirname(mets_url[len('file://'):])
            else:
                dst_dir = tempfile.mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating workspace '%s' for METS @ <%s>", dst_dir,
                          mets_url)

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = mets_url \
                .rsplit('/', 1)[-1] \
                .split('?')[0] \
                .split('#')[0]

        dst_mets = join(dst_dir, mets_basename)
        log.debug("Copying mets url '%s' to '%s'", mets_url, dst_mets)
        if 'file://' + dst_mets == mets_url:
            log.debug("Target and source mets are identical")
        else:
            if exists(dst_mets) and not clobber_mets:
                raise Exception(
                    "File '%s' already exists but clobber_mets is false" %
                    dst_mets)
            else:
                self.download_to_directory(dst_dir,
                                           mets_url,
                                           basename=mets_basename)

        workspace = Workspace(self,
                              dst_dir,
                              mets_basename=mets_basename,
                              baseurl=baseurl)

        if download:
            for f in workspace.mets.find_files():
                workspace.download_file(f)

        return workspace
예제 #30
0
    def workspace_from_url(self,
                           mets_url,
                           dst_dir=None,
                           clobber_mets=False,
                           mets_basename=None,
                           download=False,
                           src_baseurl=None):
        """
        Create a workspace from a METS by URL (i.e. clone if :py:attr:`mets_url` is remote or :py:attr:`dst_dir` is given).

        Arguments:
            mets_url (string): Source METS URL or filesystem path
        Keyword Arguments:
            dst_dir (string, None): Target directory for the workspace; created if missing.
                If unset, the parent directory of a local :py:attr:`mets_url` is used,
                otherwise a temporary directory is created with prefix :py:data:`ocrd.constants.TMP_PREFIX`.
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
            clobber_mets (boolean, False): Whether to overwrite an existing METS file in :py:attr:`dst_dir`.
                By default the download is requested with ``if_exists='skip'``.
            mets_basename (string, None): File name of the METS in :py:attr:`dst_dir`.
                Defaults to the last URL segment of :py:attr:`mets_url`.
            download (boolean, False): Whether to also download all the files referenced by the METS
            src_baseurl (string, None): Base URL for resolving relative file locations.
                Defaults to :py:attr:`mets_url` without its last segment.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            ValueError: If :py:attr:`mets_url` is ``None``.
        """
        log = getLogger('ocrd.resolver.workspace_from_url')

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path.cwd() / mets_url)

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            src_baseurl = remove_non_path_from_url(
                remove_non_path_from_url(mets_url)[:-len(last_segment)])

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                log.debug("Deriving dst_dir %s from %s",
                          Path(mets_url).parent, mets_url)
                dst_dir = Path(mets_url).parent
            else:
                # Create the directory first so the message reports its actual path
                dst_dir = mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>",
                          dst_dir, mets_url)
        # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
        if not Path(dst_dir).exists():
            Path(dst_dir).mkdir(parents=True, exist_ok=False)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug(
            "workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
            mets_basename, mets_url, src_baseurl, dst_dir)

        self.download_to_directory(
            dst_dir,
            mets_url,
            basename=mets_basename,
            if_exists='overwrite' if clobber_mets else 'skip')

        workspace = Workspace(self,
                              dst_dir,
                              mets_basename=mets_basename,
                              baseurl=src_baseurl)

        if download:
            for f in workspace.mets.find_files():
                workspace.download_file(f)

        return workspace