def download_file(self, f, _recursion_count=0):
    """
    Download a :py:mod:`ocrd.model.ocrd_file.OcrdFile` to the workspace.

    If ``f.url`` already resolves to an existing path inside
    ``self.directory``, nothing is fetched. Otherwise the file is retrieved
    with ``self.resolver.download_to_directory`` into the ``f.fileGrp``
    sub-directory. When that raises ``FileNotFoundError``, ``self.baseurl``
    is prepended to ``f.url`` and the download is retried exactly once.

    Args:
        f: The file to download. Its ``url`` and ``local_filename``
            attributes are updated in place.
        _recursion_count: Internal retry counter for the baseurl fallback;
            callers should leave it at its default.

    Returns:
        The same ``f`` object, with ``local_filename`` set to ``f.url``.

    Raises:
        Exception: If the download raised ``FileNotFoundError`` and either
            no ``baseurl`` is defined or the baseurl retry was already
            attempted. The original ``FileNotFoundError`` is attached as
            the cause.
    """
    log = getLogger('ocrd.workspace.download_file')
    log.debug('download_file %s [_recursion_count=%s]', f, _recursion_count)
    # NOTE(review): pushd_popd presumably switches the working directory, so a
    # relative f.url would be resolved against self.directory -- confirm.
    with pushd_popd(self.directory):
        # If f.url is already a file path and is within self.directory,
        # there is nothing to download.
        try:
            url_path = Path(f.url).resolve()
            already_local = url_path.exists()
            if already_local:
                # relative_to() raises ValueError when url_path lies outside
                # of the workspace directory.
                url_path.relative_to(str(Path(self.directory).resolve()))
        except Exception:
            # Deliberately broad: any failure to interpret f.url as a local
            # path inside the workspace simply means "needs downloading".
            already_local = False

        if not already_local:
            basename = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, '')) if f.ID else f.basename
            try:
                f.url = self.resolver.download_to_directory(
                    self.directory, f.url, subdir=f.fileGrp, basename=basename)
            except FileNotFoundError as e:
                if not self.baseurl:
                    raise Exception(
                        "No baseurl defined by workspace. Cannot retrieve '%s'" % f.url) from e
                if _recursion_count >= 1:
                    raise Exception(
                        "Already tried prepending baseurl '%s'. Cannot retrieve '%s'"
                        % (self.baseurl, f.url)) from e
                log.debug(
                    "First run of resolver.download_to_directory(%s) failed, try prepending baseurl '%s': %s",
                    f.url, self.baseurl, e)
                # Retry once with the workspace baseurl prepended.
                f.url = '%s/%s' % (self.baseurl, f.url)
                f.url = self.download_file(f, _recursion_count + 1).local_filename
        f.local_filename = f.url
        return f
def download_file(self, f, _recursion_count=0):
    """
    Fetch a :py:mod:`ocrd.model.ocrd_file.OcrdFile` into the workspace.

    The file is retrieved with ``self.resolver.download_to_directory`` into
    the ``f.fileGrp`` sub-directory. If that raises ``FileNotFoundError``,
    ``self.baseurl`` is prepended to ``f.url`` and the download is attempted
    one more time via a recursive call.

    ``f.url`` and ``f.local_filename`` are updated in place and ``f`` itself
    is returned. ``_recursion_count`` tracks the baseurl retry and is not
    meant to be passed by callers.
    """
    log.debug('download_file %s [_recursion_count=%s]' % (f, _recursion_count))
    with pushd_popd(self.directory):
        # HACK: name the target after the file ID (plus an extension looked
        # up from the mimetype) when an ID exists, else reuse f.basename.
        if f.ID:
            target_name = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, ''))
        else:
            target_name = f.basename

        try:
            downloaded = self.resolver.download_to_directory(
                self.directory,
                f.url,
                subdir=f.fileGrp,
                basename=target_name,
            )
            f.url = downloaded
        except FileNotFoundError as not_found:
            if not self.baseurl:
                message = "No baseurl defined by workspace. Cannot retrieve '%s'" % f.url
                raise Exception(message)
            if _recursion_count >= 1:
                message = "Already tried prepending baseurl '%s'. Cannot retrieve '%s'" % (
                    self.baseurl, f.url)
                raise Exception(message)
            log.debug(
                "First run of resolver.download_to_directory(%s) failed, try prepending baseurl '%s': %s",
                f.url, self.baseurl, not_found)
            # Second and last attempt: same file, baseurl-prefixed URL.
            f.url = '%s/%s' % (self.baseurl, f.url)
            retried = self.download_file(f, _recursion_count + 1)
            f.url = retried.local_filename

        # HACK: the local filename simply mirrors the (now local) url.
        f.local_filename = f.url
        return f
def process(self):
    """
    Copy every input file into the output file group.

    For each file in ``self.input_files``:

    * the file is downloaded into the workspace,
    * a new file ID is derived with ``make_file_id`` and the target path is
      built from the output file group, that ID and an extension looked up
      in ``MIME_TO_EXT``,
    * if the source is PAGE-XML, the parsed document (with updated
      ``pcGtsId`` and metadata) is serialized and added,
    * otherwise the source bytes are added unchanged, and, for ``image/*``
      mimetypes, a generated PAGE-XML file referencing the copied image is
      added as well.

    Exactly one input and one output file group are required; this is
    checked with ``assert_file_grp_cardinality``.
    """
    LOG = getLogger('ocrd.dummy')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    for input_file in self.input_files:
        input_file = self.workspace.download_file(input_file)
        file_id = make_file_id(input_file, self.output_file_grp)
        ext = MIME_TO_EXT.get(input_file.mimetype, '')
        local_filename = join(self.output_file_grp, file_id + ext)
        # input_file has already been downloaded above; reuse it rather than
        # asking the workspace to download the same file a second time.
        pcgts = page_from_file(input_file)
        pcgts.set_pcGtsId(file_id)
        self.add_metadata(pcgts)
        LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id)
        if input_file.mimetype == MIMETYPE_PAGE:
            # Source file is PAGE-XML: Write out in-memory PcGtsType
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=input_file.mimetype,
                local_filename=local_filename,
                content=to_xml(pcgts).encode('utf-8'))
        else:
            # Source file is not PAGE-XML: Copy byte-by-byte
            with open(input_file.local_filename, 'rb') as f:
                content = f.read()
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=input_file.mimetype,
                local_filename=local_filename,
                content=content)
            if input_file.mimetype.startswith('image/'):
                # write out the PAGE-XML representation for this image
                page_file_id = file_id + '_PAGE'
                pcgts.set_pcGtsId(page_file_id)
                # Point the generated PAGE at the copied image, not the source.
                pcgts.get_Page().set_imageFilename(local_filename)
                page_filename = join(self.output_file_grp, file_id + '.xml')
                LOG.info("Add PAGE-XML %s generated for %s at %s",
                         page_file_id, file_id, page_filename)
                self.workspace.add_file(
                    ID=page_file_id,
                    file_grp=self.output_file_grp,
                    pageId=input_file.pageId,
                    mimetype=MIMETYPE_PAGE,
                    local_filename=page_filename,
                    content=to_xml(pcgts).encode('utf-8'))