Пример #1
0
    def workspace_from_url(self, mets_url, dst_dir=None, clobber_mets=False, mets_basename=None, download=False, src_baseurl=None):
        """
        Create a workspace from a METS by URL (i.e. clone it).

        Fetches the METS into ``dst_dir`` via ``download_to_directory`` and
        wraps that directory in a ``Workspace``.

        Arguments:
            mets_url (string): Source mets URL. A relative local filename is resolved against the current working directory.
            dst_dir (string, None): Target directory for the workspace. If not given, the parent directory of a local ``mets_url`` is used; for a non-local ``mets_url`` a temporary directory is created. The directory is created if it does not exist.
            clobber_mets (boolean, False): Whether to overwrite existing mets.xml. By default existing mets.xml will raise an exception.
            mets_basename (string, None): Basename of the METS file inside the workspace. Defaults to the last URL segment of ``mets_url``.
            download (boolean, False): Whether to download all the files
            src_baseurl (string, None): Base URL for resolving relative file locations. If not given, it is derived from ``mets_url`` by removing its last URL segment.

        Returns:
            Workspace

        Raises:
            ValueError: If ``mets_url`` is None.
        """

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path.cwd() / mets_url)

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            src_baseurl = remove_non_path_from_url(remove_non_path_from_url(mets_url)[:-len(last_segment)])

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                dst_dir = Path(mets_url).parent
                log.debug("Deriving dst_dir %s from %s", dst_dir, mets_url)
            else:
                # create the directory first so the log message can report its actual name
                dst_dir = tempfile.mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url)
        # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
        if not Path(dst_dir).exists():
            Path(dst_dir).mkdir(parents=True, exist_ok=False)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
            mets_basename, mets_url, src_baseurl, dst_dir)

        self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'raise')

        workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl)

        if download:
            for f in workspace.mets.find_files():
                workspace.download_file(f)

        return workspace
Пример #2
0
 def _validate_imagefilename(self):
     """
     Check the ``imageFilename`` of every PAGE file in the METS.

     Each PAGE file is fetched through the workspace and parsed. An error is
     reported if its ``imageFilename`` is not the URL of any file in the METS,
     and a warning if it is a local filename that does not exist on disk.
     Remote PAGE files are only noted, not fetched, unless ``self.download`` is set.
     """
     self.log.debug('_validate_imagefilename')
     for page_file in self.mets.find_files(mimetype=MIMETYPE_PAGE):
         if not (is_local_filename(page_file.url) or self.download):
             self.report.add_notice("Won't download remote PAGE XML <%s>" % page_file.url)
             continue
         self.workspace.download_file(page_file)
         image_filename = page_from_file(page_file).get_Page().imageFilename
         known_in_mets = self.mets.find_files(url=image_filename)
         if not known_in_mets:
             self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (page_file.url, image_filename))
         if is_local_filename(image_filename) and not Path(image_filename).exists():
             self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (page_file.url, image_filename))
Пример #3
0
def bashlib_input_files(**kwargs):
    """
    Print the input files of a processing step in a bash-friendly format.

    A workspace is opened from the ``mets`` / ``working_dir`` options and a
    generic ``Processor`` is instantiated with the ``page_id``,
    ``input_file_grp`` and ``output_file_grp`` options. For every input file
    of that processor one line is printed, containing its `url`, `ID`,
    `mimetype` and `pageId` plus the recommended `outputFileId`
    (from ``make_file_id``).

    (Each line is the initializer of a bash associative array.)

    Raises:
        Exception: If ``mets`` is a local filename that is not an existing file.
    """
    initLogging()
    mets_url = kwargs.pop('mets')
    workspace_dir = kwargs.pop('working_dir')
    mets_file_missing = is_local_filename(mets_url) and not isfile(get_local_filename(mets_url))
    if mets_file_missing:
        raise Exception("File does not exist: %s" % mets_url)
    workspace = Resolver().workspace_from_url(mets_url, workspace_dir)
    processor = Processor(workspace,
                          ocrd_tool=None,
                          page_id=kwargs['page_id'],
                          input_file_grp=kwargs['input_file_grp'],
                          output_file_grp=kwargs['output_file_grp'])
    for input_file in processor.input_files:
        # emit the key/value pairs space-separated on a single line
        for attr_name in ('url', 'ID', 'mimetype', 'pageId'):
            attr_value = getattr(input_file, attr_name)
            print("[%s]='%s'" % (attr_name, attr_value), end=' ')
        output_file_id = make_file_id(input_file, kwargs['output_file_grp'])
        print("[outputFileId]='%s'" % output_file_id)
Пример #4
0
def ocrd_cli_wrap_processor(processorClass,
                            ocrd_tool=None,
                            mets=None,
                            working_dir=None,
                            dump_json=False,
                            version=False,
                            **kwargs):
    """
    Dispatch the command line options of a processor.

    Args:
        processorClass: Processor class to instantiate / run.
        ocrd_tool: Passed through to ``run_processor``.
        mets (string, None): URL or filename of the METS. Required unless
            ``dump_json`` or ``version`` is set.
        working_dir (string, None): Passed to ``Resolver.workspace_from_url`` as target directory.
        dump_json (boolean, False): Only instantiate the processor with ``dump_json=True``.
        version (boolean, False): Only print the processor and core versions.
        **kwargs: Passed through to ``run_processor``.

    Raises:
        Exception: If ``mets`` is missing, or is a local filename that is not an existing file.
            If the processor cannot be instantiated for ``version``, the
            original error is logged and re-raised.
    """
    LOG = getLogger('ocrd_cli_wrap_processor')
    if dump_json:
        processorClass(workspace=None, dump_json=True)
    elif version:
        try:
            p = processorClass(workspace=None)
        except Exception as err:
            # Without an instance there is no version to report: surface the
            # real error instead of failing later on an unbound name.
            LOG.error("Cannot determine version, instantiating the processor failed: %s", err)
            raise
        print("Version %s, ocrd/core %s" % (p.version, OCRD_VERSION))
    elif mets is None:
        msg = 'Error: Missing option "-m" / "--mets".'
        LOG.error(msg)
        raise Exception(msg)
    else:
        if is_local_filename(mets) and not isfile(get_local_filename(mets)):
            msg = "File does not exist: %s" % mets
            LOG.error(msg)
            raise Exception(msg)
        resolver = Resolver()
        workspace = resolver.workspace_from_url(mets, working_dir)
        run_processor(processorClass,
                      ocrd_tool,
                      mets,
                      workspace=workspace,
                      **kwargs)
Пример #5
0
    def find_files(self,
                   ID=None,
                   fileGrp=None,
                   pageId=None,
                   mimetype=None,
                   url=None,
                   local_only=False):
        """
        Search ``mets:file`` in this METS document.

        The ``ID``, ``fileGrp``, ``mimetype`` and ``url`` filters are turned into
        XPath predicates; ``pageId`` is resolved through the ``mets:fptr``
        entries of the matching page ``mets:div``.

        Args:
            ID (string) : ID of the file
            fileGrp (string) : USE of the fileGrp to list files of
            pageId (string) : ID of physical page manifested by matching files
            mimetype (string) : MIMETYPE of matching files
            url (string) : @xlink:href of mets:Flocat of mets:file
            local_only (boolean) : Whether to restrict results to local files

        Return:
            List of files.
        """
        root = self._tree.getroot()

        # Assemble the XPath predicates for the requested filters
        group_predicate = '[@USE="%s"]' % fileGrp if fileGrp is not None else ''
        file_predicates = []
        if ID is not None:
            file_predicates.append('[@ID="%s"]' % ID)
        if mimetype is not None:
            file_predicates.append('[@MIMETYPE="%s"]' % mimetype)
        if url is not None:
            file_predicates.append('[mets:FLocat[@xlink:href = "%s"]]' % url)
        # TODO lxml says invalid predicate. I disagree
        #  if local_only:
        #      file_clause += "[mets:FLocat[starts-with(@xlink:href, 'file://')]]"

        # Search
        matching_ids = root.xpath(
            "//mets:fileGrp%s/mets:file%s/@ID" % (group_predicate, ''.join(file_predicates)),
            namespaces=NS)
        if pageId is not None:
            ids_on_page = root.xpath(
                '//mets:div[@TYPE="page"][@ID="%s"]/mets:fptr/@FILEID' %
                pageId,
                namespaces=NS)
            # keep the order of the page's file pointers
            matching_ids = [page_file_id for page_file_id in ids_on_page if page_file_id in matching_ids]

        # instantiate / get from cache
        found = []
        for file_id in matching_ids:
            file_el = root.find('.//mets:file[@ID="%s"]' % file_id, NS)
            if file_id not in self._file_by_id:
                self._file_by_id[file_id] = OcrdFile(file_el, mets=self)
            ocrd_file = self._file_by_id[file_id]

            # If only local resources should be returned, drop files whose URL is not a file path
            file_url = ocrd_file.url
            if not local_only or is_local_filename(file_url):
                found.append(ocrd_file)
        return found
Пример #6
0
 def _validate_dimension(self):
     """
     Compare the image size declared in each PAGE file with the actual image size.

     For every PAGE file, ``imageHeight`` / ``imageWidth`` of the page are
     checked against the height / width of the image information returned by
     ``workspace.image_from_page``; each mismatch is reported as an error.
     Remote PAGE files are only noted unless ``self.download`` is set.
     """
     self.log.info('_validate_dimension')
     for page_file in self.mets.find_files(mimetype=MIMETYPE_PAGE):
         if not (is_local_filename(page_file.url) or self.download):
             self.report.add_notice("_validate_dimension: Not executed because --download wasn't set and PAGE might reference remote (Alternative)Images <%s>" % page_file.url)
             continue
         page = page_from_file(page_file).get_Page()
         _, _, image_info = self.workspace.image_from_page(page, page_file.pageId)

         declared_height, actual_height = page.imageHeight, image_info.height
         if declared_height != actual_height:
             self.report.add_error("PAGE '%s': @imageHeight != image's actual height (%s != %s)" % (page_file.ID, declared_height, actual_height))

         declared_width, actual_width = page.imageWidth, image_info.width
         if declared_width != actual_width:
             self.report.add_error("PAGE '%s': @imageWidth != image's actual width (%s != %s)" % (page_file.ID, declared_width, actual_width))
Пример #7
0
    def _validate_pixel_density(self):
        """
        Add a notice for every image whose pixel density is missing or at most 72.

        Both ``xResolution`` and ``yResolution`` of the image's EXIF
        information are checked. Remote images are only noted unless
        ``self.download`` is set.

        See `spec <https://ocr-d.github.io/mets#pixel-density-of-images-must-be-explicit-and-high-enough>`_.
        """
        self.log.debug('_validate_pixel_density')
        image_files = [candidate for candidate in self.mets.find_files()
                       if candidate.mimetype.startswith('image/')]
        for image_file in image_files:
            if not (is_local_filename(image_file.url) or self.download):
                self.report.add_notice("Won't download remote image <%s>" % image_file.url)
                continue
            exif = self.workspace.resolve_image_exif(image_file.url)
            exif_fields = vars(exif)
            for key in ('xResolution', 'yResolution'):
                density = exif_fields.get(key)
                if density is None or density <= 72:
                    self.report.add_notice("Image %s: %s (%s pixels per %s) is suspiciously low" % (image_file.ID, key, density, exif.resolutionUnit))
Пример #8
0
    def _validate_multipage(self):
        """
        Report an error for every image that contains more than one frame
        (TIFF allows multi-page images).

        Remote images are only noted unless ``self.download`` is set.

        See `spec <https://ocr-d.github.io/mets#no-multi-page-images>`_.
        """
        image_files = [candidate for candidate in self.mets.find_files()
                       if candidate.mimetype.startswith('image/')]
        for image_file in image_files:
            if not (is_local_filename(image_file.url) or self.download):
                self.report.add_notice("Won't download remote image <%s>" %
                                       image_file.url)
                continue
            frame_count = self.workspace.resolve_image_exif(image_file.url).n_frames
            if frame_count > 1:
                self.report.add_error("Image %s: More than 1 frame: %s" %
                                      (image_file.ID, frame_count))
Пример #9
0
    def _bag_mets_files(self, workspace, bagdir, ocrd_manifestation_depth,
                        ocrd_mets, processes):
        """
        Copy the files referenced by the workspace METS into the bag and write manifests.

        Every file found in the METS is fetched into
        ``<bagdir>/data/<fileGrp>/<ID>`` via
        ``self.resolver.download_to_directory`` and its URL is rewritten to
        ``<fileGrp>/<ID>``. Non-local files are only fetched if
        ``ocrd_manifestation_depth`` is ``'full'`` and the URL starts with
        ``http``; otherwise they are passed to ``self._log_or_raise`` and
        skipped. Finally the METS is written to ``<bagdir>/data/<ocrd_mets>``
        and ``make_manifests`` is run from within ``bagdir``.

        The current working directory is changed while bagging and is always
        restored before returning, also when an exception is raised.

        Args:
            workspace: Workspace whose ``mets`` and ``directory`` are bagged.
            bagdir (string): Directory of the bag.
            ocrd_manifestation_depth (string): ``'full'`` to also fetch non-local files.
            ocrd_mets (string): Filename of the METS below ``<bagdir>/data``.
            processes: Passed through to ``make_manifests``.

        Returns:
            tuple: ``(total_bytes, total_files)`` as returned by ``make_manifests``.
        """
        mets = workspace.mets

        # TODO allow filtering by fileGrp@USE and such
        oldpwd = getcwd()
        try:
            # relative local file URLs are resolved against the workspace directory
            chdir(workspace.directory)
            for f in mets.find_files():
                log.info("Resolving %s (%s)", f.url, ocrd_manifestation_depth)
                if is_local_filename(f.url):
                    f.url = abspath(f.url)
                # XXX cannot happen because chdir above
                #  elif is_local_filename(join(workspace.directory, 'data', f.url)):
                #      f.url = abspath(join(workspace.directory, 'data', f.url))
                elif ocrd_manifestation_depth != 'full':
                    self._log_or_raise(
                        "Not fetching non-local files, skipping %s" % f.url,
                        oldpwd)
                    continue
                elif not f.url.startswith('http'):
                    self._log_or_raise("Not an http URL: %s" % f.url, oldpwd)
                    continue
                log.info("Resolved %s", f.url)

                file_grp_dir = join(bagdir, 'data', f.fileGrp)
                if not isdir(file_grp_dir):
                    makedirs(file_grp_dir)
                self.resolver.download_to_directory(file_grp_dir,
                                                    f.url,
                                                    basename=f.ID)
                f.url = join(f.fileGrp, f.ID)

            # save mets.xml
            with open(join(bagdir, 'data', ocrd_mets), 'wb') as mets_out:
                mets_out.write(workspace.mets.to_xml())

            # manifests are generated for the 'data' directory relative to the bag
            chdir(bagdir)
            total_bytes, total_files = make_manifests('data',
                                                      processes,
                                                      algorithms=['sha512'])
        finally:
            chdir(oldpwd)
        return total_bytes, total_files
Пример #10
0
def ocrd_cli_wrap_processor(processorClass,
                            ocrd_tool=None,
                            mets=None,
                            working_dir=None,
                            dump_json=False,
                            help=False,
                            version=False,
                            **kwargs):
    """
    Dispatch the command line options of a processor.

    With ``dump_json``, ``help`` or ``version`` (checked in that order) the
    processor is merely instantiated with the matching flag. Otherwise a
    workspace is opened from ``mets``, the input/output file groups are
    checked and the processor is run.

    Args:
        processorClass: Processor class to instantiate / run.
        ocrd_tool: Passed through to ``run_processor``.
        mets (string, None): URL or filename of the METS.
        working_dir (string, None): Passed to ``Resolver.workspace_from_url`` as target directory.
        dump_json (boolean, False): Instantiate the processor with ``dump_json=True``.
        help (boolean, False): Instantiate the processor with ``show_help=True``.
        version (boolean, False): Instantiate the processor with ``show_version=True``.
        **kwargs: Must contain ``input_file_grp`` and ``output_file_grp``;
            passed through to ``run_processor``.

    Raises:
        Exception: If ``mets`` is missing or is a local filename that is not
            an existing file, or if the file group check fails.
    """
    LOG = getLogger('ocrd_cli_wrap_processor')

    # Informational modes do not need a workspace
    if dump_json:
        processorClass(workspace=None, dump_json=True)
        return
    if help:
        processorClass(workspace=None, show_help=True)
        return
    if version:
        processorClass(workspace=None, show_version=True)
        return

    if mets is None:
        message = 'Error: Missing option "-m" / "--mets".'
        LOG.error(message)
        raise Exception(message)

    mets_file_missing = is_local_filename(mets) and not isfile(get_local_filename(mets))
    if mets_file_missing:
        message = "File does not exist: %s" % mets
        LOG.error(message)
        raise Exception(message)

    workspace = Resolver().workspace_from_url(mets, working_dir)
    # TODO once we implement 'overwrite' CLI option and mechanism, disable the
    # `output_file_grp_ check by setting to False-y value if 'overwrite' is set
    report = WorkspaceValidator.check_file_grp(workspace,
                                               kwargs['input_file_grp'],
                                               kwargs['output_file_grp'])
    if not report.is_valid:
        problems = '\n\t'.join(report.errors)
        raise Exception("Invalid input/output file grps:\n\t%s" % problems)
    run_processor(processorClass,
                  ocrd_tool,
                  mets,
                  workspace=workspace,
                  **kwargs)
Пример #11
0
    def __init__(self,
                 el,
                 mimetype=None,
                 pageId=None,
                 loctype='OTHER',
                 local_filename=None,
                 mets=None,
                 url=None,
                 ID=None):
        """
        Represent an existing ``mets:file`` element.

        Args:
            el (LxmlElement): etree Element of the ``mets:file`` this represents. Must not be None.
        Keyword Args:
            mets (OcrdMets): Containing :py:class:`ocrd_models.ocrd_mets.OcrdMets`.
            mimetype (string): ``@MIMETYPE`` of this ``mets:file``
            pageId (string): ``@ID`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file``
            loctype (string): ``@LOCTYPE`` of this ``mets:file``
            local_filename (string): Local filename. If not given and the URL is a
                local filename, it is derived from the URL.
            url (string): ``@xlink:href`` of this ``mets:file``
            ID (string): ``@ID`` of this ``mets:file``

        Raises:
            ValueError: If ``el`` is None.
        """
        if el is None:
            raise ValueError(
                "Must provide mets:file element this OcrdFile represents")
        self._el = el
        self.mets = mets
        # NOTE(review): these attributes may be property setters defined outside
        # this view; the assignment order is kept as is.
        self.ID = ID
        self.mimetype = mimetype
        self.local_filename = local_filename
        self.loctype = loctype
        self.pageId = pageId

        if url:
            self.url = url

        # An explicitly passed local filename takes precedence over the URL
        if local_filename:
            return
        current_url = self.url
        if current_url and is_local_filename(current_url):
            self.local_filename = get_local_filename(current_url)
Пример #12
0
    def download_file(self, f):
        """
        Make a :py:mod:`ocrd.model.ocrd_file.OcrdFile` available in the workspace.

        If the file's URL is a local filename, its absolute path (relative to
        the workspace directory) becomes the ``local_filename``. Otherwise the
        file is downloaded via ``download_url`` to ``<fileGrp>/<ID>`` unless
        it already has a ``local_filename``.

        Returns:
            The same file object ``f``.
        """
        previous_cwd = os.getcwd()
        try:
            # resolve relative paths against the workspace directory
            os.chdir(self.directory)
            if is_local_filename(f.url):
                f.local_filename = abspath(f.url)
            elif f.local_filename:
                log.debug("Already downloaded: %s", f.local_filename)
            else:
                target_basename = '%s/%s' % (f.fileGrp, f.ID)
                f.local_filename = self.download_url(f.url, basename=target_basename)
        finally:
            os.chdir(previous_cwd)

        return f
Пример #13
0
    def __init__(self,
                 el,
                 mimetype=None,
                 pageId=None,
                 loctype='OTHER',
                 local_filename=None,
                 mets=None,
                 url=None,
                 ID=None):
        """
        Represent a ``mets:file`` element, creating a new one if none is passed.

        Args:
            el (LxmlElement): etree Element of the mets:file this represents. Create new if not provided
            mimetype (string): MIME type of the file
            pageId (string): ID of the physical page
            loctype (string): METS @LOCTYPE
            local_filename (string): Local filename. If not given and the URL is a
                local filename, it is derived from the URL.
            mets (OcrdMets): Containing OcrdMets
            url (string): xlink:href of the file
            ID (string): @ID of the mets:file
        """
        self._el = ET.Element(TAG_METS_FILE) if el is None else el
        self.mets = mets
        # NOTE(review): these attributes may be property setters defined outside
        # this view; the assignment order is kept as is.
        self.ID = ID
        self.mimetype = mimetype
        self.local_filename = local_filename
        self.loctype = loctype
        self.pageId = pageId

        if url:
            self.url = url

        # An explicitly passed local filename takes precedence over the URL
        if local_filename:
            return
        current_url = self.url
        if current_url and is_local_filename(current_url):
            self.local_filename = get_local_filename(current_url)
Пример #14
0
def ocrd_cli_wrap_processor(
        processorClass,
        ocrd_tool=None,
        mets=None,
        working_dir=None,
        dump_json=False,
        help=False,  # pylint: disable=redefined-builtin
        version=False,
        overwrite=False,
        **kwargs):
    """
    Dispatch the command line options of a processor.

    Without any command line arguments the processor's help is shown and the
    process exits with status 1. With ``dump_json``, ``help`` or ``version``
    the processor is instantiated with the matching flags and the process
    exits. Otherwise a workspace is opened from ``mets``, the input/output
    file groups are checked and the processor is run.

    Args:
        processorClass: Processor class to instantiate / run.
        ocrd_tool: Passed through to ``run_processor``.
        mets (string): URL or filename of the METS.
        working_dir (string, None): Passed to ``Resolver.workspace_from_url`` as target directory.
        dump_json (boolean, False): Passed to the processor as ``dump_json``.
        help (boolean, False): Passed to the processor as ``show_help``.
        version (boolean, False): Passed to the processor as ``show_version``.
        overwrite (boolean, False): Set ``overwrite_mode`` on the workspace and
            skip the check of the output file group.
        **kwargs: Must contain ``input_file_grp`` and (unless ``overwrite``)
            ``output_file_grp``; passed through to ``run_processor``.

    Raises:
        Exception: If ``mets`` is a local filename that is not an existing
            file, or if the file group check fails.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        processorClass(workspace=None, show_help=True)
        sys.exit(1)

    info_only = dump_json or help or version
    if info_only:
        processorClass(workspace=None,
                       dump_json=dump_json,
                       show_help=help,
                       show_version=version)
        sys.exit()

    initLogging()
    LOG = getLogger('ocrd_cli_wrap_processor')

    # Merge parameter overrides and parameters
    if 'parameter_override' in kwargs:
        overrides = kwargs['parameter_override']
        set_json_key_value_overrides(kwargs['parameter'], *overrides)

    # TODO OCR-D/core#274: assert that -I/--input-file-grp and
    # -O/--output-file-grp are given

    mets_file_missing = is_local_filename(mets) and not isfile(get_local_filename(mets))
    if mets_file_missing:
        message = "File does not exist: %s" % mets
        LOG.error(message)
        raise Exception(message)

    workspace = Resolver().workspace_from_url(mets, working_dir)
    page_id = kwargs.get('page_id')

    # XXX Removing existing output files per file group / page on --overwrite is
    # not possible while processors do not adhere to
    # https://github.com/OCR-D/core/issues/505. While that issue is open, set
    # 'overwrite_mode' globally on the workspace instead.
    if overwrite:
        workspace.overwrite_mode = True

    input_file_grp = kwargs['input_file_grp']
    output_file_grp_to_check = '' if overwrite else kwargs['output_file_grp']
    report = WorkspaceValidator.check_file_grp(
        workspace, input_file_grp, output_file_grp_to_check, page_id)
    if not report.is_valid:
        problems = '\n\t'.join(report.errors)
        raise Exception("Invalid input/output file grps:\n\t%s" % problems)

    run_processor(processorClass,
                  ocrd_tool,
                  mets,
                  workspace=workspace,
                  **kwargs)
Пример #15
0
    def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None):
        """
        Download a file to a directory.

        Early Shortcut: If url is a local file and that file is already in the directory, keep it there.

        If basename is not given, the result of ``nth_url_segment(url)`` is used as the basename.

        Args:
            directory (string): Directory to download files to. Created if it does not exist.
            basename (string, None): basename part of the filename on disk.
            url (string): URL to download from
            if_exists (string, "skip"): What to do if target file already exists. One of ``skip`` (default), ``overwrite`` or ``raise``
            subdir (string, None): Subdirectory to create within the directory. Think fileGrp.

        Returns:
            Local filename, __relative__ to directory

        Raises:
            Exception: If ``url`` or ``directory`` is empty, or if the HTTP request does not return status 200.
            FileNotFoundError: If ``url`` is a local filename that does not exist.
            FileExistsError: If the target file exists and ``if_exists`` is ``raise``.
        """
        log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
        log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir)

        if not url:
            raise Exception("'url' must be a string")
        if not directory:
            raise Exception("'directory' must be a string")  # actually Path would also work

        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)
        directory = str(directory.resolve())

        subdir_path = Path(subdir if subdir else '')
        basename_path = Path(basename if basename else nth_url_segment(url))
        ret = str(Path(subdir_path, basename_path))
        dst_path = Path(directory, ret)

        src_path = None
        if is_local_filename(url):
            local_filename = get_local_filename(url)
            try:
                # XXX this raises FNFE in Python 3.5 if src_path doesn't exist but not 3.6+
                src_path = Path(local_filename).resolve()
            except FileNotFoundError:
                # src_path is still unset here, so report the unresolved filename
                log.error("Failed to resolve URL locally: %s --> '%s' which does not exist", url, local_filename)
                raise
            if not src_path.exists():
                raise FileNotFoundError("File path passed as 'url' to download_to_directory does not exist: %s" % url)
            if src_path == dst_path:
                log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')", src_path, url)
                return ret

        # Respect 'if_exists' arg
        if dst_path.exists():
            if if_exists == 'skip':
                return ret
            if if_exists == 'raise':
                raise FileExistsError("File already exists and if_exists == 'raise': %s" % (dst_path))

        # Create dst_path parent dir
        dst_path.parent.mkdir(parents=True, exist_ok=True)

        # Copy files or download remote assets
        if src_path:
            log.debug("Copying file '%s' to '%s'", src_path, dst_path)
            dst_path.write_bytes(src_path.read_bytes())
        else:
            log.debug("Downloading URL '%s' to '%s'", url, dst_path)
            response = requests.get(url)
            if response.status_code != 200:
                raise Exception("HTTP request failed: %s (HTTP %d)" % (url, response.status_code))
            dst_path.write_bytes(response.content)

        return ret
Пример #16
0
    def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None, local_only=False):
        """
        Search ``mets:file`` entries in this METS document and yield results.


        The :py:attr:`ID`, :py:attr:`fileGrp`, :py:attr:`url` and :py:attr:`mimetype`
        parameters can each be either a literal string, or a regular expression if
        the string starts with ``//`` (double slash).

        If it is a regex, the leading ``//`` is removed and candidates are matched
        against the regex with `re.fullmatch`. If it is a literal string, comparison
        is done with string equality.

        The :py:attr:`pageId` parameter is a comma-separated list of page IDs and
        supports the numeric range operator ``..``. For example, to find all files
        in pages ``PHYS_0001`` to ``PHYS_0003``, ``PHYS_0001..PHYS_0003`` will be
        expanded to ``PHYS_0001,PHYS_0002,PHYS_0003``.

        Keyword Args:
            ID (string) : ``@ID`` of the ``mets:file``
            fileGrp (string) : ``@USE`` of the ``mets:fileGrp`` to list files of
            pageId (string) : ``@ID`` of the corresponding physical ``mets:structMap`` entry (physical page)
            url (string) : ``@xlink:href`` (URL or path) of ``mets:Flocat`` of ``mets:file``
            mimetype (string) : ``@MIMETYPE`` of ``mets:file``
            local_only (boolean) : Whether to restrict results to local files in the filesystem

        Yields:
            :py:class:`ocrd_models:ocrd_file:OcrdFile` instantiations

        Raises:
            Exception: If ``pageId`` is a regular expression.
        """
        if pageId:
            if pageId.startswith(REGEX_PREFIX):
                raise Exception("find_files does not support regex search for pageId")
            pageIds = pageId.split(',')
            # Sets give constant-time membership tests below; ranges are
            # expanded in the order they were passed.
            wanted_pages = set(pageIds)
            for pageId_ in pageIds:
                if '..' in pageId_:
                    wanted_pages.update(generate_range(*pageId_.split('..', 2)))
            # From here on pageId holds the IDs of all files attached to the requested pages
            pageId = set()
            for page in self._tree.getroot().xpath(
                '//mets:div[@TYPE="page"]', namespaces=NS):
                if page.get('ID') in wanted_pages:
                    pageId.update(
                        fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS))
        for cand in self._tree.getroot().xpath('//mets:file', namespaces=NS):
            if ID:
                if ID.startswith(REGEX_PREFIX):
                    if not fullmatch(ID[REGEX_PREFIX_LEN:], cand.get('ID')): continue
                else:
                    if not ID == cand.get('ID'): continue

            # NOTE(review): an empty-string pageId is not None, so this check
            # then excludes every file with a non-empty ID - confirm this is intended.
            if pageId is not None and cand.get('ID') not in pageId:
                continue

            if fileGrp:
                if fileGrp.startswith(REGEX_PREFIX):
                    if not fullmatch(fileGrp[REGEX_PREFIX_LEN:], cand.getparent().get('USE')): continue
                else:
                    if cand.getparent().get('USE') != fileGrp: continue

            if mimetype:
                if mimetype.startswith(REGEX_PREFIX):
                    if not fullmatch(mimetype[REGEX_PREFIX_LEN:], cand.get('MIMETYPE') or ''): continue
                else:
                    if cand.get('MIMETYPE') != mimetype: continue

            if url:
                cand_locat = cand.find('mets:FLocat', namespaces=NS)
                # files without a location cannot match a URL filter
                if cand_locat is None:
                    continue
                cand_url = cand_locat.get('{%s}href' % NS['xlink'])
                if url.startswith(REGEX_PREFIX):
                    if not fullmatch(url[REGEX_PREFIX_LEN:], cand_url): continue
                else:
                    if cand_url != url: continue

            f = OcrdFile(cand, mets=self)

            # If only local resources should be returned and f is not a file path: skip the file
            if local_only and not is_local_filename(f.url):
                continue
            yield f
Пример #17
0
 def test_is_local_filename(self):
     """A bare ``file:///`` URL is expected to be reported as a local filename."""
     outcome = is_local_filename('file:///')
     self.assertEqual(outcome, True)
Пример #18
0
    def find_files(self,
                   ID=None,
                   fileGrp=None,
                   pageId=None,
                   mimetype=None,
                   url=None,
                   local_only=False):
        """
        Search ``mets:file`` in this METS document.


        The ``ID``, ``fileGrp``, ``url`` and ``mimetype`` parameters can be
        either a literal string or a regular expression if the string starts
        with ``//`` (double slash). If it is a regex, the leading ``//`` is removed
        and candidates are matched against the regex with ``re.fullmatch``. If it is
        a literal string, comparison is done with string equality.

        Args:
            ID (string) : ID of the file
            fileGrp (string) : USE of the fileGrp to list files of
            pageId (string) : Comma-separated IDs of physical pages manifested by matching files
            url (string) : @xlink:href of mets:Flocat of mets:file
            mimetype (string) : MIMETYPE of matching files
            local_only (boolean) : Whether to restrict results to local files

        Return:
            List of files.

        Raises:
            Exception: If ``pageId`` is a regular expression.
        """
        ret = []
        if pageId:
            if pageId.startswith(REGEX_PREFIX):
                raise Exception(
                    "find_files does not support regex search for pageId")
            # From here on pageId holds the IDs of all files attached to the requested pages
            pageIds, pageId = pageId.split(','), list()
            for page in self._tree.getroot().xpath('//mets:div[@TYPE="page"]',
                                                   namespaces=NS):
                if page.get('ID') in pageIds:
                    pageId.extend([
                        fptr.get('FILEID')
                        for fptr in page.findall('mets:fptr', NS)
                    ])
        for cand in self._tree.getroot().xpath('//mets:file', namespaces=NS):
            if ID:
                if ID.startswith(REGEX_PREFIX):
                    if not fullmatch(ID[REGEX_PREFIX_LEN:], cand.get('ID')):
                        continue
                else:
                    if not ID == cand.get('ID'): continue

            if pageId is not None and cand.get('ID') not in pageId:
                continue

            if fileGrp:
                if fileGrp.startswith(REGEX_PREFIX):
                    if not fullmatch(fileGrp[REGEX_PREFIX_LEN:],
                                     cand.getparent().get('USE')):
                        continue
                else:
                    if cand.getparent().get('USE') != fileGrp: continue

            if mimetype:
                if mimetype.startswith(REGEX_PREFIX):
                    if not fullmatch(mimetype[REGEX_PREFIX_LEN:],
                                     cand.get('MIMETYPE') or ''):
                        continue
                else:
                    if cand.get('MIMETYPE') != mimetype: continue

            if url:
                cand_locat = cand.find('mets:FLocat', namespaces=NS)
                # A mets:file without mets:FLocat has no URL and cannot match;
                # skip it instead of failing on the missing element.
                if cand_locat is None:
                    continue
                cand_url = cand_locat.get('{%s}href' % NS['xlink'])
                if url.startswith(REGEX_PREFIX):
                    if not fullmatch(url[REGEX_PREFIX_LEN:], cand_url):
                        continue
                else:
                    if cand_url != url: continue

            f = OcrdFile(cand, mets=self)

            # If only local resources should be returned and f is not a file path: skip the file
            if local_only and not is_local_filename(f.url):
                continue
            ret.append(f)
        return ret
Пример #19
0
 def test_is_local_filename(self):
     """Check which location strings ``is_local_filename`` treats as local."""
     # An absolute path, two spellings of a file: URL and a relative path
     # are all expected to be recognised as local.
     local_locations = (
         '/foo/bar',
         'file:///foo/bar',
         'file:/foo/bar',
         'foo/bar',
     )
     for location in local_locations:
         self.assertTrue(is_local_filename(location))
     # A URL with an unknown scheme is expected to be rejected.
     self.assertFalse(is_local_filename('bad-scheme://foo/bar'))
Пример #20
0
    def _bag_mets_files(self, workspace, bagdir, ocrd_manifestation_depth,
                        ocrd_mets, processes):
        """
        Place the files referenced by the workspace METS under ``<bagdir>/data``.

        Every file found in the METS is fetched via
        ``self.resolver.download_to_directory`` into
        ``<bagdir>/data/<fileGrp>/<ID><extension>`` and its URL in the METS is
        rewritten to that relative path. The updated METS is then written to
        ``<bagdir>/data/<ocrd_mets>``, PAGE files in the bag have their
        ``imageFilename`` adjusted to the new paths, and the manifests are
        generated.

        Arguments:
            workspace: Workspace whose METS and files are bagged
            bagdir (string): Root directory of the bag
            ocrd_manifestation_depth (string): Non-local files are only
                fetched when this is ``'full'``
            ocrd_mets (string): File name of the METS inside ``data``
            processes: Passed through to ``make_manifests``

        Returns:
            The pair ``(total_bytes, total_files)`` produced by ``make_manifests``.
        """
        source_mets = workspace.mets
        # Maps each original file URL to its new path relative to ``data``
        url_map = {}

        # TODO allow filtering by fileGrp@USE and such
        with pushd_popd(workspace.directory):
            for ocrd_file in source_mets.find_files():
                log.info("Resolving %s (%s)", ocrd_file.url, ocrd_manifestation_depth)
                # Local files need no checks; remote ones must be allowed by
                # the manifestation depth and must be http(s) URLs.
                if not is_local_filename(ocrd_file.url):
                    if ocrd_manifestation_depth != 'full':
                        self._log_or_raise(
                            "Not fetching non-local files, skipping %s" % ocrd_file.url)
                        continue
                    if not ocrd_file.url.startswith('http'):
                        self._log_or_raise("Not an http URL: %s" % ocrd_file.url)
                        continue
                log.info("Resolved %s", ocrd_file.url)

                target_dir = join(bagdir, 'data', ocrd_file.fileGrp)
                if not isdir(target_dir):
                    makedirs(target_dir)

                target_name = "%s%s" % (ocrd_file.ID, ocrd_file.extension)
                target_relpath = join(ocrd_file.fileGrp, target_name)
                self.resolver.download_to_directory(target_dir,
                                                    ocrd_file.url,
                                                    basename=target_name)
                # Record the mapping before the URL is overwritten
                url_map[ocrd_file.url] = target_relpath
                ocrd_file.url = target_relpath

            # save mets.xml
            with open(join(bagdir, 'data', ocrd_mets), 'wb') as mets_out:
                mets_out.write(workspace.mets.to_xml())

        # Walk through bagged workspace and fix the PAGE
        # Page/@imageFilename and
        # AlternativeImage/@filename
        bag_workspace = Workspace(self.resolver,
                                  directory=join(bagdir, 'data'))
        with pushd_popd(bag_workspace.directory):
            for page_file in bag_workspace.mets.find_files(
                    mimetype=MIMETYPE_PAGE):
                pcgts = page_from_file(page_file)
                modified = False
                for old_url, new_path in url_map.items():
                    if pcgts.get_Page().imageFilename != old_url:
                        continue
                    pcgts.get_Page().imageFilename = new_path
                    modified = True
                    # TODO replace AlternativeImage, recursively...
                if modified:
                    with open(page_file.url, 'w') as page_out:
                        page_out.write(to_xml(pcgts))

            # Manifests are computed for 'data' relative to the bag root
            chdir(bagdir)
            total_bytes, total_files = make_manifests('data',
                                                      processes,
                                                      algorithms=['sha512'])
            log.info("New vs. old: %s" % url_map)
        return total_bytes, total_files
Пример #21
0
    def workspace_from_url(self,
                           mets_url,
                           dst_dir=None,
                           clobber_mets=False,
                           mets_basename=None,
                           download=False,
                           src_baseurl=None):
        """
        Create a workspace from a METS by URL or filesystem path.

        The METS is handed to :py:meth:`download_to_directory` so that it ends
        up as :py:attr:`mets_basename` inside :py:attr:`dst_dir`, and a
        workspace is then created on that directory.

        Arguments:
            mets_url (string): Source METS URL or filesystem path. \
                A relative local path is resolved against the current working directory.
        Keyword Arguments:
            dst_dir (string, None): Target directory for the workspace. \
                If not given, the parent directory of :py:attr:`mets_url` is used when \
                that is a local filename; otherwise a new temporary directory is created \
                with prefix :py:data:`ocrd.constants.TMP_PREFIX`. \
                The directory is created if it does not exist yet. \
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
            clobber_mets (boolean, False): Whether to overwrite an existing METS in \
                :py:attr:`dst_dir`. Passed to :py:meth:`download_to_directory` as \
                ``if_exists='overwrite'`` when true, ``if_exists='skip'`` otherwise.
            mets_basename (string, None): File name of the METS inside :py:attr:`dst_dir`. \
                By default the last URL segment of :py:attr:`mets_url`.
            download (boolean, False): Whether to also download all the files referenced by the METS
            src_baseurl (string, None): Base URL for resolving relative file locations. \
                By default derived from :py:attr:`mets_url` by removing its last segment.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            ValueError: if :py:attr:`mets_url` is ``None``
        """
        log = getLogger('ocrd.resolver.workspace_from_url')

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path.cwd() / mets_url)

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            src_baseurl = remove_non_path_from_url(
                remove_non_path_from_url(mets_url)[:-len(last_segment)])

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                log.debug("Deriving dst_dir %s from %s",
                          Path(mets_url).parent, mets_url)
                dst_dir = Path(mets_url).parent
            else:
                # Create the directory first so the message reports the actual
                # path instead of the still-unset dst_dir.
                dst_dir = mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>",
                          dst_dir, mets_url)
        # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
        if not Path(dst_dir).exists():
            Path(dst_dir).mkdir(parents=True, exist_ok=False)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug(
            "workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
            mets_basename, mets_url, src_baseurl, dst_dir)

        # Fetch (or copy) the METS itself into the workspace directory
        self.download_to_directory(
            dst_dir,
            mets_url,
            basename=mets_basename,
            if_exists='overwrite' if clobber_mets else 'skip')

        workspace = Workspace(self,
                              dst_dir,
                              mets_basename=mets_basename,
                              baseurl=src_baseurl)

        # Optionally materialise every file referenced by the METS
        if download:
            for f in workspace.mets.find_files():
                workspace.download_file(f)

        return workspace