def put(self, url, filename=None, content=None):
    """
    Store a resource in the cache under a file name derived from its URL.

    Args:
        url (string): URL the cached data belongs to; ``safe_filename(url)``
            is used as the file name inside ``self.directory``.
        filename (string, None): Path of a local file whose bytes should be
            cached. When truthy it takes precedence over ``content``.
        content (bytes, None): Raw bytes to cache when no ``filename`` is used.

    Returns:
        Path of the file written inside the cache directory.

    Raises:
        Exception: If neither ``filename`` nor ``content`` is provided.
    """
    target = os.path.join(self.directory, safe_filename(url))
    if filename is None and content is None:
        raise Exception("cache.put requires 'filename' or 'content' kwarg")

    payload = content
    if filename:
        # A source file was named: its bytes replace whatever was passed as content.
        with open(filename, 'rb') as source:
            payload = source.read()

    with open(target, 'wb') as sink:
        sink.write(payload)
    return target
def download_to_directory(self, directory, url, basename=None, overwrite=False, subdir=None):
    """
    Download a file to the workspace.

    Early Shortcut: If the target file already exists and ``overwrite`` is
    false, nothing is transferred and the existing path is returned. For a
    file://-URL that points into ``directory`` the last URL segment is used as
    basename, so a file located directly in ``directory`` resolves to itself
    and is kept there.

    If basename is not given but subdir is, assume user knows what she's doing
    and use last URL segment as the basename.
    If basename is not given and no subdir is given, use ``safe_filename(url)``
    as the basename.

    Args:
        directory (string): Directory to download files to
        url (string): URL to download from
        basename (string, None): basename part of the filename on disk.
        overwrite (boolean): Whether to overwrite existing files with that name
        subdir (string, None): Subdirectory to create within the directory. Think fileGrp.

    Returns:
        Local filename

    Raises:
        Exception: If an HTTP(S) download answers with a status other than 200.
            Errors raised by the file copy or by the HTTP request itself are
            not caught here.
    """
    log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
    log.debug("directory=|%s| url=|%s| basename=|%s| overwrite=|%s| subdir=|%s|",
              directory, url, basename, overwrite, subdir)

    if basename is None:
        if (subdir is not None) or \
                (directory and url.startswith('file://%s' % directory)):
            # in case downloading a url 'file:///tmp/foo/bar' to directory '/tmp/foo'
            basename = url.rsplit('/', 1)[-1]
        else:
            basename = safe_filename(url)

    if subdir is not None:
        basename = os.path.join(subdir, basename)

    outfilename = os.path.join(directory, basename)

    if os.path.exists(outfilename) and not overwrite:
        log.debug("File already exists and overwrite=False: %s", outfilename)
        return outfilename

    # os.path.dirname instead of splitting on '/': for a path without any
    # separator the split returned the file name itself, which made makedirs
    # create a *directory* at the target path and the transfer fail.
    outfiledir = os.path.dirname(outfilename)
    if outfiledir and not os.path.isdir(outfiledir):
        os.makedirs(outfiledir)

    log.debug("Downloading <%s> to '%s'", url, outfilename)
    if url.startswith('file://'):
        copyfile(url[len('file://'):], outfilename)
    else:
        response = requests.get(url)
        if response.status_code != 200:
            raise Exception("Not found: %s (HTTP %d)" % (url, response.status_code))
        with open(outfilename, 'wb') as outfile:
            outfile.write(response.content)

    return outfilename
def download_to_directory(self, directory, url, basename=None, overwrite=False, subdir=None, prefer_symlink=None):
    """
    Download a file to the workspace.

    If the target file already exists and ``overwrite`` is false, nothing is
    transferred and the existing path is returned.

    If basename is not given but subdir is, assume user knows what she's doing
    and use last URL segment as the basename.
    If basename is not given and no subdir is given, use ``safe_filename(url)``
    as the basename.

    When ``self.cache_enabled`` is set, a cached copy of the URL is used if
    ``self.cache.get(url)`` finds one; otherwise the freshly obtained file is
    stored via ``self.cache.put``.

    Args:
        directory (string): Directory to download files to
        url (string): URL to download from
        basename (string, None): basename part of the filename on disk.
        overwrite (boolean): Whether to overwrite existing files with that name
        subdir (string, None): Subdirectory to create within the directory. Think fileGrp.
        prefer_symlink (boolean): Whether to use symlinks instead of copying.
            Passed through to ``self._copy_or_symlink``. Overrides self.prefer_symlink

    Returns:
        Local filename

    Raises:
        Exception: If an HTTP(S) download answers with a status other than 200.
            In that case the target file is neither created nor modified.
    """
    log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name

    if basename is None:
        if subdir is not None:
            basename = url.rsplit('/', 1)[-1]
        else:
            basename = safe_filename(url)

    if subdir is not None:
        basename = os.path.join(subdir, basename)

    outfilename = os.path.join(directory, basename)

    if os.path.exists(outfilename) and not overwrite:
        log.debug("File already exists and overwrite=False: %s", outfilename)
        return outfilename

    # os.path.dirname instead of splitting on '/': for a path without any
    # separator the split returned the file name itself, which made makedirs
    # create a *directory* at the target path and the transfer fail.
    outfiledir = os.path.dirname(outfilename)
    if outfiledir and not os.path.isdir(outfiledir):
        os.makedirs(outfiledir)

    cached_filename = self.cache.get(url) if self.cache_enabled else False

    if cached_filename:
        log.debug("Found cached version of <%s> at '%s'", url, cached_filename)
        self._copy_or_symlink(cached_filename, outfilename, prefer_symlink)
    else:
        log.debug("Downloading <%s> to '%s'", url, outfilename)
        if url.startswith('file://'):
            self._copy_or_symlink(url[len('file://'):], outfilename, prefer_symlink)
        else:
            # Fetch and validate *before* opening the target for writing, so a
            # failed download cannot leave an empty file behind that a later
            # call with overwrite=False would mistake for a valid result.
            response = requests.get(url)
            if response.status_code != 200:
                raise Exception("Not found: %s (HTTP %d)" % (url, response.status_code))
            with open(outfilename, 'wb') as outfile:
                outfile.write(response.content)

    if self.cache_enabled and not cached_filename:
        cached_filename = self.cache.put(url, filename=outfilename)
        log.debug("Stored in cache <%s> at '%s'", url, cached_filename)

    return outfilename
def get(self, url):
    """
    Look up the cached file for a URL.

    Args:
        url (string): URL whose cached copy is wanted; ``safe_filename(url)``
            is the file name looked for inside ``self.directory``.

    Returns:
        Path of the cached file if it exists on disk, otherwise None.
    """
    candidate = os.path.join(self.directory, safe_filename(url))
    return candidate if os.path.exists(candidate) else None