Пример #1
0
 def download_file(self, f, _recursion_count=0):
     """
     Download a :py:mod:`ocrd.model.ocrd_file.OcrdFile` to the workspace.

     If ``f.url`` already resolves to an existing path inside
     ``self.directory``, nothing is downloaded. Otherwise the resolver
     fetches it into the ``f.fileGrp`` subdirectory of the workspace. If
     that fails with ``FileNotFoundError`` and the workspace has a
     ``baseurl``, the download is retried exactly once with ``baseurl``
     prepended to ``f.url``.

     Args:
         f: the file to download; its ``url`` and ``local_filename``
             attributes are updated in place.
         _recursion_count: internal retry counter for the ``baseurl``
             fallback; callers should leave it at the default.

     Returns:
         The same ``f`` object, with ``local_filename`` set to ``url``.

     Raises:
         Exception: if the resolver raised ``FileNotFoundError`` and either
             no ``baseurl`` is defined or the ``baseurl`` retry was already
             attempted. The original error is attached as ``__cause__``.
     """
     log = getLogger('ocrd.workspace.download_file')
     log.debug('download_file %s [_recursion_count=%s]', f, _recursion_count)
     with pushd_popd(self.directory):
         # Best-effort probe: is f.url already a file path inside the
         # workspace directory? Any failure of the probe simply means
         # "not local yet", so the broad catch is deliberate here.
         try:
             url_path = Path(f.url).resolve()
             already_local = url_path.exists()
             if already_local:
                 # relative_to() raises ValueError when url_path lies
                 # outside the workspace directory.
                 url_path.relative_to(str(Path(self.directory).resolve()))
         except Exception:
             already_local = False

         # The download happens outside of the probe's exception handler so
         # that download errors do not carry the probe failure as context.
         if not already_local:
             if f.ID:
                 basename = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, ''))
             else:
                 basename = f.basename
             try:
                 f.url = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename)
             except FileNotFoundError as e:
                 if not self.baseurl:
                     raise Exception("No baseurl defined by workspace. Cannot retrieve '%s'" % f.url) from e
                 if _recursion_count >= 1:
                     raise Exception("Already tried prepending baseurl '%s'. Cannot retrieve '%s'" % (self.baseurl, f.url)) from e
                 log.debug("First run of resolver.download_to_directory(%s) failed, try prepending baseurl '%s': %s", f.url, self.baseurl, e)
                 # Retry once with the workspace baseurl prepended.
                 f.url = '%s/%s' % (self.baseurl, f.url)
                 f.url = self.download_file(f, _recursion_count + 1).local_filename
         f.local_filename = f.url
         return f
Пример #2
0
 def download_file(self, f, _recursion_count=0):
     """
     Fetch a :py:mod:`ocrd.model.ocrd_file.OcrdFile` into the workspace.

     The resolver downloads ``f.url`` into the ``f.fileGrp`` subdirectory of
     ``self.directory``. When the resolver raises ``FileNotFoundError`` and
     the workspace has a ``baseurl``, the download is retried once with
     ``baseurl`` prepended to ``f.url``. On success ``f.url`` and
     ``f.local_filename`` both hold the resolver's result and ``f`` itself
     is returned.

     Raises ``Exception`` when the resolver raised ``FileNotFoundError`` and
     no ``baseurl`` is defined, or when the ``baseurl`` retry was already
     attempted.
     """
     # NOTE(review): ``log`` is not defined in this method; presumably a
     # module-level logger -- confirm.
     log.debug('download_file %s [_recursion_count=%s]' % (f, _recursion_count))
     with pushd_popd(self.directory):
         # Target file name: ID plus mimetype-derived extension when the
         # file has an ID, otherwise the file's own basename.
         if f.ID:
             target_name = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, ''))
         else:
             target_name = f.basename
         try:
             f.url = self.resolver.download_to_directory(
                 self.directory, f.url, subdir=f.fileGrp, basename=target_name)
         except FileNotFoundError as not_found:
             if not self.baseurl:
                 message = "No baseurl defined by workspace. Cannot retrieve '%s'" % f.url
                 raise Exception(message)
             if _recursion_count >= 1:
                 message = "Already tried prepending baseurl '%s'. Cannot retrieve '%s'" % (self.baseurl, f.url)
                 raise Exception(message)
             log.debug(
                 "First run of resolver.download_to_directory(%s) failed, try prepending baseurl '%s': %s",
                 f.url, self.baseurl, not_found)
             # Second and last attempt: same file, URL prefixed with baseurl.
             f.url = '%s/%s' % (self.baseurl, f.url)
             retried = self.download_file(f, _recursion_count + 1)
             f.url = retried.local_filename
         # The URL doubles as the local file name after a download.
         f.local_filename = f.url
         return f
Пример #3
0
 def process(self):
     """
     Copy every file of the input file group into the output file group.

     PAGE-XML inputs are written out from the parsed in-memory
     representation (after setting a new pcGtsId and calling
     ``self.add_metadata``). All other inputs are copied byte for byte.
     For image inputs, an additional PAGE-XML file whose image filename
     points at the copied image is added to the output file group.

     Both the input and the output file group must pass
     ``assert_file_grp_cardinality(..., 1)``.
     """
     LOG = getLogger('ocrd.dummy')
     assert_file_grp_cardinality(self.input_file_grp, 1)
     assert_file_grp_cardinality(self.output_file_grp, 1)
     for input_file in self.input_files:
         input_file = self.workspace.download_file(input_file)
         file_id = make_file_id(input_file, self.output_file_grp)
         ext = MIME_TO_EXT.get(input_file.mimetype, '')
         local_filename = join(self.output_file_grp, file_id + ext)
         # input_file was downloaded above; it is not downloaded a second
         # time just to parse it.
         pcgts = page_from_file(input_file)
         pcgts.set_pcGtsId(file_id)
         self.add_metadata(pcgts)
         LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id)
         if input_file.mimetype == MIMETYPE_PAGE:
             # Source file is PAGE-XML: Write out in-memory PcGtsType
             self.workspace.add_file(
                 ID=file_id,
                 file_grp=self.output_file_grp,
                 pageId=input_file.pageId,
                 mimetype=input_file.mimetype,
                 local_filename=local_filename,
                 content=to_xml(pcgts).encode('utf-8'))
         else:
             # Source file is not PAGE-XML: Copy byte-by-byte
             with open(input_file.local_filename, 'rb') as f:
                 content = f.read()
             # The source handle is closed before the copy is registered.
             self.workspace.add_file(
                 ID=file_id,
                 file_grp=self.output_file_grp,
                 pageId=input_file.pageId,
                 mimetype=input_file.mimetype,
                 local_filename=local_filename,
                 content=content)
             if input_file.mimetype.startswith('image/'):
                 # write out the PAGE-XML representation for this image
                 page_file_id = file_id + '_PAGE'
                 pcgts.set_pcGtsId(page_file_id)
                 pcgts.get_Page().set_imageFilename(local_filename)
                 page_filename = join(self.output_file_grp, file_id + '.xml')
                 LOG.info("Add PAGE-XML %s generated for %s at %s",
                          page_file_id, file_id, page_filename)
                 self.workspace.add_file(
                     ID=page_file_id,
                     file_grp=self.output_file_grp,
                     pageId=input_file.pageId,
                     mimetype=MIMETYPE_PAGE,
                     local_filename=page_filename,
                     content=to_xml(pcgts).encode('utf-8'))