예제 #1
0
 def test_rename_file_group(self):
     """Renaming a fileGrp moves its image and updates the PAGE imageFilename."""
     fixture = assets.path_to(
         'kant_aufklaerung_1784-page-region-line-word_glyph/data')
     with copy_of_directory(fixture) as tempdir:
         workspace = Workspace(self.resolver, directory=tempdir)
         # The relative paths checked below are resolved against the
         # workspace copy, hence the directory change.
         with pushd_popd(tempdir):
             file_before = next(
                 workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001'))
             page_before = page_from_file(file_before).get_Page()
             assert page_before.imageFilename == 'OCR-D-IMG/OCR-D-IMG_0001.tif'

             workspace.rename_file_group('OCR-D-IMG', 'FOOBAR')

             file_after = next(
                 workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001'))
             page_after = page_from_file(file_after).get_Page()
             assert page_after.imageFilename == 'FOOBAR/OCR-D-IMG_0001.tif'

             renamed_image = Path('FOOBAR/OCR-D-IMG_0001.tif')
             original_image = Path('OCR-D-IMG/OCR-D-IMG_0001.tif')
             assert renamed_image.exists()
             assert not original_image.exists()
예제 #2
0
 def test_remove_file_group_force(self):
     """Removing an unknown fileGrp raises unless ``force=True`` is passed."""
     source = assets.path_to('SBB0000F29300010000/data')
     with copy_of_directory(source) as tempdir:
         workspace = Workspace(self.resolver, directory=tempdir)
         missing_group = 'I DO NOT EXIST'
         # without force: must raise
         with self.assertRaisesRegex(Exception, "No such fileGrp"):
             workspace.remove_file_group(missing_group)
         # with force: must not raise
         workspace.remove_file_group(missing_group, force=True)
예제 #3
0
 def test_remove_file_force(self):
     """Removing an unknown file ID raises unless ``force=True`` is passed."""
     source = assets.path_to('SBB0000F29300010000/data')
     with copy_of_directory(source) as tempdir:
         workspace = Workspace(self.resolver, directory=tempdir)
         missing_id = 'non-existing-id'
         # without force: expect FileNotFoundError
         with self.assertRaisesRegex(FileNotFoundError, "not found"):
             workspace.remove_file(missing_id)
         # with force: expect no error
         workspace.remove_file(missing_id, force=True)
예제 #4
0
    def workspace_from_url(self, mets_url, directory=None, clobber_mets=False, mets_basename=None, download=False, download_local=False):
        """
        Create a workspace from a METS by URL.

        The METS is copied/downloaded into ``directory`` (unless source and
        target are the same file) and a Workspace is created on top of it.

        Arguments:
            mets_url (string, None): Source METS URL or filesystem path. If
                ``None``, it is derived from ``directory`` and ``mets_basename``.
            directory (string, None): Target directory of the workspace. If
                ``None``, the directory of a ``file://`` METS is used, otherwise
                a temporary directory is created.
            clobber_mets (boolean, False): Whether to overwrite an existing METS
                file in ``directory``.
            mets_basename (string, None): File name of the METS inside
                ``directory``. Defaults to the last segment of ``mets_url``
                (or ``mets.xml`` when ``mets_url`` is not given either).
            download (boolean, False): Download the files of every file group.
            download_local (boolean, False): Only download the files found with
                ``local_only=True``; takes precedence over ``download``.

        Returns:
            Workspace

        Raises:
            Exception: if neither ``mets_url`` nor ``directory`` is given, or if
                the target METS exists and ``clobber_mets`` is false.
        """
        if directory is not None and not directory.startswith('/'):
            directory = os.path.abspath(directory)

        if mets_url is None:
            if directory is None:
                raise Exception("Must pass mets_url and/or directory to workspace_from_url")
            # Fall back to the conventional name; formatting ``None`` into the
            # URL would otherwise point at a file literally called "None".
            fallback_basename = 'mets.xml' if mets_basename is None else mets_basename
            mets_url = 'file://%s/%s' % (directory, fallback_basename)
        if '://' not in mets_url:
            # resolve to absolute
            mets_url = 'file://' + os.path.abspath(mets_url)
        if directory is None:
            if mets_url.startswith('file://'):
                # if directory was not given and mets_url is a file assume that
                # directory should be the directory where the mets.xml resides
                directory = os.path.dirname(mets_url[len('file://'):])
            else:
                directory = tempfile.mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating workspace '%s' for METS @ <%s>", directory, mets_url)

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = mets_url \
                .rsplit('/', 1)[-1] \
                .split('?')[0] \
                .split('#')[0]

        mets_fpath = os.path.join(directory, mets_basename)
        log.debug("Copying mets url '%s' to '%s'", mets_url, mets_fpath)
        if 'file://' + mets_fpath == mets_url:
            log.debug("Target and source mets are identical")
        elif os.path.exists(mets_fpath) and not clobber_mets:
            raise Exception("File '%s' already exists but clobber_mets is false" % mets_fpath)
        else:
            self.download_to_directory(directory, mets_url, basename=mets_basename)

        workspace = Workspace(self, directory, mets_basename=mets_basename)

        if download_local or download:
            for file_grp in workspace.mets.file_groups:
                if download_local:
                    for f in workspace.mets.find_files(fileGrp=file_grp, local_only=True):
                        workspace.download_file(f, subdir=file_grp)
                else:
                    workspace.download_files_in_group(file_grp)

        return workspace
예제 #5
0
def test_rename_file_group(tmp_path):
    """Renaming a fileGrp renames its files, PAGE image references and page mapping."""
    # arrange
    copytree(
        assets.path_to(
            'kant_aufklaerung_1784-page-region-line-word_glyph/data'),
        str(tmp_path))
    workspace = Workspace(Resolver(), directory=str(tmp_path))

    # before act
    # TODO clear semantics
    # requires rather odd additional path-setting because root path from
    # workspace is not propagated - works only if called inside workspace
    # which can be achieved with pushd_popd functionalities
    ocrd_file = next(workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001'))
    relative_name = ocrd_file.local_filename
    ocrd_file.local_filename = join(tmp_path, relative_name)
    pcgts_before = page_from_file(ocrd_file)
    # before assert
    assert pcgts_before.get_Page(
    ).imageFilename == 'OCR-D-IMG/OCR-D-IMG_0001.tif'

    # act
    workspace.rename_file_group('OCR-D-IMG', 'FOOBAR')
    next_ocrd_file = next(
        workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001'))
    next_ocrd_file.local_filename = join(tmp_path, relative_name)
    pcgts_after = page_from_file(next_ocrd_file)

    # assert
    assert pcgts_after.get_Page().imageFilename == 'FOOBAR/FOOBAR_0001.tif'
    assert Path(tmp_path / 'FOOBAR/FOOBAR_0001.tif').exists()
    # Check the old location inside the workspace copy; a path relative to
    # the current working directory would make this assertion vacuous.
    assert not Path(tmp_path / 'OCR-D-IMG/OCR-D-IMG_0001.tif').exists()
    assert workspace.mets.get_physical_pages(
        for_fileIds=['OCR-D-IMG_0001']) == [None]
    assert workspace.mets.get_physical_pages(for_fileIds=['FOOBAR_0001']) == [
        'phys_0001'
    ]
예제 #6
0
파일: resolver.py 프로젝트: mjenckel/core
    def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
        """
        Create an empty workspace.

        Writes an empty METS named ``mets_basename`` into ``directory``
        (a fresh temporary directory if ``directory`` is ``None``; missing
        directories are created) and returns a Workspace for it.

        Raises:
            FileExistsError: if the METS file already exists and
                ``clobber_mets`` is not set.
        """
        if directory is None:
            directory = tempfile.mkdtemp(prefix=TMP_PREFIX)
        workspace_dir = Path(directory)
        workspace_dir.mkdir(parents=True, exist_ok=True)

        mets_file = workspace_dir / mets_basename
        if mets_file.exists() and not clobber_mets:
            message = "METS '%s' already exists in '%s' and clobber_mets not set." % (mets_basename, directory)
            raise FileExistsError(message)

        empty_mets = OcrdMets.empty_mets()
        log.info("Writing METS to %s", mets_file)
        mets_file.write_bytes(empty_mets.to_xml(xmllint=True))

        return Workspace(self, directory, empty_mets)
예제 #7
0
    def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
        """
        Create an empty workspace.

        Writes an empty METS named ``mets_basename`` into ``directory``
        (a fresh temporary directory if ``directory`` is ``None``; a missing
        directory is created) and returns a Workspace for it.

        Raises:
            Exception: if the METS file already exists and ``clobber_mets``
                is not set.
        """
        if directory is None:
            directory = tempfile.mkdtemp(prefix=TMP_PREFIX)
        if not os.path.exists(directory):
            os.makedirs(directory)

        target_mets = os.path.join(directory, mets_basename)
        mets_present = os.path.exists(target_mets) if not clobber_mets else False
        if mets_present:
            raise Exception("Not clobbering existing mets.xml in '%s'." % directory)

        empty_mets = OcrdMets.empty_mets()
        with open(target_mets, 'wb') as out:
            log.info("Writing %s", target_mets)
            serialized = empty_mets.to_xml(xmllint=True)
            out.write(serialized)

        return Workspace(self, directory, empty_mets)
예제 #8
0
    def unpack_workspace_from_filename(self, zip_filename, directory=None):
        """
        Unpack an OCRD-ZIP to a local workspace.

        1. Create a temporary directory (unless ``directory`` is given)
        2. Unpack the zipfile into it
        3. Initiate a workspace on that directory

        Args:
            zip_filename (string) : Path to OCRD-ZIP file
            directory (string, None) : Directory to unpack into
        """
        target_dir = directory
        if target_dir is None:
            target_dir = tempfile.mkdtemp(prefix=TMP_PREFIX)
        log.debug("Unpacking to %s", target_dir)
        with ZipFile(zip_filename, 'r') as archive:
            archive.extractall(path=target_dir)
        return Workspace(self, target_dir)
예제 #9
0
    def workspace_from_url(self, mets_url, directory=None):
        """
        Create a workspace from a METS by URL.

        Downloads ``mets_url`` as ``mets.xml`` into ``directory`` and returns
        a Workspace for that directory.

        Arguments:
            mets_url (string): Source METS URL or filesystem path. A value
                without a URL scheme is treated as a local path.
            directory (string, None): Target directory. If ``None``, a
                temporary directory is created.

        Returns:
            Workspace

        Raises:
            Exception: if ``mets_url`` is ``None``.
        """
        # Local import keeps this block self-contained.
        import os

        if mets_url is None:
            raise Exception("Must pass mets_url to workspace_from_url")
        if '://' not in mets_url:
            # A file:// URL needs an absolute path; prefixing a relative path
            # would turn its first segment into the URL's host part.
            mets_url = 'file://' + os.path.abspath(mets_url)
        if directory is None:
            directory = tempfile.mkdtemp(prefix=TMP_PREFIX)
        log.debug("Creating workspace '%s' for METS @ <%s>", directory,
                  mets_url)
        self.download_to_directory(directory,
                                   mets_url,
                                   basename='mets.xml',
                                   prefer_symlink=False)
        return Workspace(self, directory)
예제 #10
0
 def test_remove_file_page_recursive(self):
     """Page-recursive file removal reduces the METS file count from 119 to 83."""
     source = assets.path_to('kant_aufklaerung_1784-complex/data')
     with copy_of_directory(source) as tempdir:
         with pushd_popd(tempdir):
             workspace = Workspace(self.resolver, directory=tempdir)
             count_before = len(workspace.mets.find_all_files())
             self.assertEqual(count_before, 119)
             workspace.remove_file(
                 'OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001',
                 page_recursive=True,
                 page_same_group=False,
                 keep_file=True)
             count_after = len(workspace.mets.find_all_files())
             self.assertEqual(count_after, 83)
             # second removal is only expected not to raise
             workspace.remove_file('PAGE_0017_ALTO', page_recursive=True)
예제 #11
0
    def workspace_from_folder(self,
                              directory,
                              return_mets=False,
                              clobber_mets=False,
                              convention='ocrd-gt'):
        """
        Create a workspace from a folder, creating a METS file.

        Args:
            directory (string) : Existing directory to build the workspace from.
            convention: See add_files_to_mets
            clobber_mets (boolean) : Whether to overwrite existing mets.xml. Default: False.
            return_mets (boolean) : Do not create the actual mets.xml file but return the :class:`OcrdMets`. Default: False.

        Returns:
            The :class:`OcrdMets` if ``return_mets`` is set, otherwise a
            Workspace on ``directory``.

        Raises:
            Exception: if ``directory`` is ``None`` or not an existing
                directory, or if it already contains a ``mets.xml`` and
                ``clobber_mets`` is false.
        """
        if directory is None:
            raise Exception("Must pass directory")
        if not os.path.isdir(directory):
            raise Exception(
                "Directory does not exist or is not a directory: '%s'" %
                directory)
        if not clobber_mets and os.path.exists(
                os.path.join(directory, 'mets.xml')):
            raise Exception("Not clobbering existing mets.xml in '%s'." %
                            directory)

        mets = OcrdMets(content=METS_XML_EMPTY)

        # The directory is known to exist here (isdir guard above), so there
        # is nothing to create; only normalize it to an absolute path.
        directory = os.path.abspath(directory)

        self.add_files_to_mets(convention, mets, directory)

        if return_mets:
            return mets

        mets_fpath = os.path.join(directory, 'mets.xml')
        with open(mets_fpath, 'wb') as fmets:
            log.info("Writing %s", mets_fpath)
            fmets.write(mets.to_xml(xmllint=True))

        return Workspace(self, directory, mets)
예제 #12
0
def test_workspace_remove_groups_unforce(workspace_directory):
    """Removing fileGrps by pattern (recursive, no force) drops the ALTO group from the saved METS."""
    mets_path = os.path.join(workspace_directory, 'mets.xml')
    alto_group_xpath = './/{http://www.loc.gov/METS/}fileGrp[@USE="OCR-D-GT-ALTO"]'

    # arrange
    mets_before = ET.parse(mets_path).getroot()
    groups_before = mets_before.findall(alto_group_xpath)
    assert len(groups_before) == 1
    files_before = groups_before[0].findall('.//{http://www.loc.gov/METS/}file')
    assert len(files_before) == 2

    # act
    workspace = Workspace(Resolver(), workspace_directory)
    workspace.remove_file_group('//OCR-D-GT.*', recursive=True)
    workspace.save_mets()

    # assert
    mets_after = ET.parse(mets_path).getroot()
    assert mets_after is not None
    groups_after = mets_after.findall(alto_group_xpath)
    assert not groups_after
예제 #13
0
    def workspace_from_url(self,
                           mets_url,
                           dst_dir=None,
                           clobber_mets=False,
                           mets_basename=None,
                           download=False,
                           baseurl=None):
        """
        Create a workspace from a METS by URL.

        Copies the METS into ``dst_dir`` unless source and target are the
        same file, then creates a Workspace on ``dst_dir``.

        Arguments:
            mets_url (string): Source mets URL
            dst_dir (string, None): Target directory for the workspace
            clobber_mets (boolean, False): Whether to overwrite existing mets.xml. By default existing mets.xml will raise an exception.
            mets_basename (string, None): File name of the METS in ``dst_dir``. Defaults to the last segment of ``mets_url``.
            download (boolean, False): Whether to download all the files
            baseurl (string, None): Base URL for resolving relative file locations

        Returns:
            Workspace

        Raises:
            Exception: if neither ``mets_url`` nor ``baseurl`` is given, or if
                the target METS exists and ``clobber_mets`` is false.
        """
        if dst_dir and not dst_dir.startswith('/'):
            dst_dir = abspath(dst_dir)

        if mets_url is None and baseurl is None:
            raise Exception(
                "Must pass mets_url and/or baseurl to workspace_from_url")
        if mets_url is None:
            fallback_name = mets_basename if mets_basename else 'mets.xml'
            mets_url = 'file://%s/%s' % (baseurl, fallback_name)
        if baseurl is None:
            baseurl = mets_url.rsplit('/', 1)[0]
        log.debug(
            "workspace_from_url\nmets_url='%s'\nbaseurl='%s'\ndst_dir='%s'",
            mets_url, baseurl, dst_dir)

        # a value without scheme is a local path: make it an absolute file URL
        if mets_url.find('://') == -1:
            mets_url = 'file://%s' % abspath(mets_url)

        if dst_dir is None:
            if mets_url.startswith('file://'):
                # no dst_dir and a local METS: work in the directory where
                # the mets.xml resides
                local_path = mets_url[len('file://'):]
                dst_dir = dirname(local_path)
            else:
                dst_dir = tempfile.mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating workspace '%s' for METS @ <%s>", dst_dir,
                          mets_url)

        # derive the METS file name from the last URL segment if not given
        if mets_basename is None:
            last_segment = mets_url.rsplit('/', 1)[-1]
            without_query = last_segment.split('?')[0]
            mets_basename = without_query.split('#')[0]

        target_mets = join(dst_dir, mets_basename)
        log.debug("Copying mets url '%s' to '%s'", mets_url, target_mets)
        if mets_url == 'file://' + target_mets:
            log.debug("Target and source mets are identical")
        elif exists(target_mets) and not clobber_mets:
            raise Exception(
                "File '%s' already exists but clobber_mets is false" %
                target_mets)
        else:
            self.download_to_directory(dst_dir,
                                       mets_url,
                                       basename=mets_basename)

        workspace = Workspace(self,
                              dst_dir,
                              mets_basename=mets_basename,
                              baseurl=baseurl)

        if download:
            for ocrd_file in workspace.mets.find_files():
                workspace.download_file(ocrd_file)

        return workspace
예제 #14
0
 def test_remove_file_group_rmdir(self):
     """Recursively removing a fileGrp also removes its directory."""
     source = assets.path_to('SBB0000F29300010000/data')
     with copy_of_directory(source) as tempdir:
         workspace = Workspace(self.resolver, directory=tempdir)
         group_dir = join(tempdir, 'OCR-D-IMG')
         self.assertTrue(exists(group_dir))
         workspace.remove_file_group('OCR-D-IMG', recursive=True)
         self.assertFalse(exists(group_dir))
예제 #15
0
def _fixture_kant_complex(tmp_path):
    """Yield a Workspace on a copy of the 'kant_aufklaerung_1784-complex' data in ``tmp_path``."""
    copytree(assets.path_to('kant_aufklaerung_1784-complex/data'),
             str(tmp_path))
    # Pass a Resolver instance, not the class itself, so that resolver
    # methods invoked through the workspace receive a proper ``self``.
    yield Workspace(Resolver(), directory=tmp_path)
예제 #16
0
def _fixture_sbb_data(sbb_data_tmp):
    """Yield a Workspace (with a fresh Resolver) on the ``sbb_data_tmp`` directory."""
    yield Workspace(Resolver(), directory=sbb_data_tmp)
예제 #17
0
    def workspace_from_url(self,
                           mets_url,
                           dst_dir=None,
                           clobber_mets=False,
                           mets_basename=None,
                           download=False,
                           src_baseurl=None):
        """
        Create a workspace from a METS by URL (i.e. clone if :py:attr:`mets_url` is remote or :py:attr:`dst_dir` is given).

        Arguments:
            mets_url (string): Source METS URL or filesystem path
        Keyword Arguments:
            dst_dir (string, None): Target directory for the workspace. \
                By default create a temporary directory under :py:data:`ocrd.constants.TMP_PREFIX`. \
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
            clobber_mets (boolean, False): Whether to overwrite an existing METS file \
                in :py:attr:`dst_dir`; by default an existing file is skipped.
            mets_basename (string, None): File name of the METS in :py:attr:`dst_dir`. \
                By default the last URL segment of :py:attr:`mets_url`.
            download (boolean, False): Whether to also download all the files referenced by the METS
            src_baseurl (string, None): Base URL for resolving relative file locations

        Download (clone) :py:attr:`mets_url` to :py:attr:`mets_basename` in :py:attr:`dst_dir`.
        If :py:attr:`dst_dir` is not given, a local :py:attr:`mets_url` is used in place
        (its parent directory becomes the workspace directory).

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            ValueError: if :py:attr:`mets_url` is ``None``
        """
        log = getLogger('ocrd.resolver.workspace_from_url')

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path(Path.cwd() / mets_url))

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            src_baseurl = remove_non_path_from_url(
                remove_non_path_from_url(mets_url)[:-len(last_segment)])

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                log.debug("Deriving dst_dir %s from %s",
                          Path(mets_url).parent, mets_url)
                dst_dir = Path(mets_url).parent
            else:
                # Create the directory first so the log line shows the real
                # path instead of the still-empty dst_dir.
                dst_dir = mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>",
                          dst_dir, mets_url)
        # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
        if not Path(dst_dir).exists():
            Path(dst_dir).mkdir(parents=True, exist_ok=False)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug(
            "workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
            mets_basename, mets_url, src_baseurl, dst_dir)

        self.download_to_directory(
            dst_dir,
            mets_url,
            basename=mets_basename,
            if_exists='overwrite' if clobber_mets else 'skip')

        workspace = Workspace(self,
                              dst_dir,
                              mets_basename=mets_basename,
                              baseurl=src_baseurl)

        if download:
            for f in workspace.mets.find_files():
                workspace.download_file(f)

        return workspace
예제 #18
0
파일: resolver.py 프로젝트: cclauss/core
    def workspace_from_url(self,
                           mets_url,
                           dst_dir=None,
                           clobber_mets=False,
                           mets_basename=None,
                           download=False,
                           src_baseurl=None):
        """
        Create a workspace from a METS by URL (i.e. clone it).

        Downloads ``mets_url`` as ``mets_basename`` into ``dst_dir`` and
        creates a Workspace on that directory.

        Arguments:
            mets_url (string): Source METS URL or filesystem path
            dst_dir (string, None): Target directory for the workspace. By default the parent directory of a local ``mets_url``, otherwise a new temporary directory.
            clobber_mets (boolean, False): Whether to overwrite an existing METS file in ``dst_dir``; by default an existing file is skipped.
            mets_basename (string, None): File name of the METS in ``dst_dir``. By default the last URL segment of ``mets_url``.
            download (boolean, False): Whether to download all the files
            src_baseurl (string, None): Base URL for resolving relative file locations

        Returns:
            Workspace

        Raises:
            ValueError: if ``mets_url`` is ``None``
        """
        log = getLogger('ocrd.resolver.workspace_from_url')

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path(Path.cwd() / mets_url))

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            src_baseurl = remove_non_path_from_url(
                remove_non_path_from_url(mets_url)[:-len(last_segment)])

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                log.debug("Deriving dst_dir %s from %s",
                          Path(mets_url).parent, mets_url)
                dst_dir = Path(mets_url).parent
            else:
                # Create the directory first so the log line shows the real
                # path instead of the still-empty dst_dir.
                dst_dir = tempfile.mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>",
                          dst_dir, mets_url)
        # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
        if not Path(dst_dir).exists():
            Path(dst_dir).mkdir(parents=True, exist_ok=False)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug(
            "workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
            mets_basename, mets_url, src_baseurl, dst_dir)

        self.download_to_directory(
            dst_dir,
            mets_url,
            basename=mets_basename,
            if_exists='overwrite' if clobber_mets else 'skip')

        workspace = Workspace(self,
                              dst_dir,
                              mets_basename=mets_basename,
                              baseurl=src_baseurl)

        if download:
            for f in workspace.mets.find_files():
                workspace.download_file(f)

        return workspace