def urlretrieve(url: str, filename: str, context: ssl.SSLContext, reporthook=None, cookies_path=None):
    """Download *url* to *filename* over a connection using *context*.

    original source: https://github.com/python/cpython/blob/
    21bee0bd71e1ad270274499f9f58194ebb52e236/Lib/urllib/request.py#L229
    Because urlopen also supports context, I decided to adapt the download function.

    reporthook: optional callable ``(blocknum, blocksize, totalsize)`` invoked
        before the first read and after every block.
    cookies_path: optional path to a Mozilla-format cookie file whose cookies
        are attached to the request.
    Returns ``(filename, headers)``.
    Raises RuntimeError if *filename* is empty for a non-file URL, and
    ContentTooShortError when fewer bytes than Content-Length arrive.
    """
    url_parsed = urlparse.urlparse(url)
    request = urllib.request.Request(url=url, headers=RequestHelper.stdHeader)
    if cookies_path is not None:
        cookie_jar = MozillaCookieJar(cookies_path)
        cookie_jar.load(ignore_discard=True, ignore_expires=True)
        cookie_jar.add_cookie_header(request)
    with contextlib.closing(urllib.request.urlopen(request, context=context)) as fp:
        headers = fp.info()
        # Just return the local path and the 'headers' for file://
        # URLs. No sense in performing a copy unless requested.
        if url_parsed.scheme == 'file' and not filename:
            return os.path.normpath(url_parsed.path), headers
        if not filename:
            raise RuntimeError('No filename specified!')
        tfp = open(filename, 'wb')
        with tfp:
            result = filename, headers
            # bytes read overall
            read = 0
            # 8 KiB at once (BUG FIX: the old comment claimed "4kb", which
            # did not match 1024 * 8)
            bs = 1024 * 8
            blocknum = 0
            # total size advertised by the server; -1 when unknown
            size = int(headers.get('Content-Length', -1))
            if reporthook:
                reporthook(blocknum, bs, size)
            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
    if size >= 0 and read < size:
        raise ContentTooShortError('retrieval incomplete: got only %i out of %i bytes' % (read, size), result)
    return result
def retrieve(self, url, filename, reporthook=None, data=None):
    """Download *url* into *filename* using ``self.opener``.

    reporthook: optional callable ``(blocknum, blocksize, totalsize)``.
    data: optional POST body forwarded to the opener.
    Returns ``(filename, headers)``.
    Raises ContentTooShortError when fewer bytes than the advertised
    Content-Length were received.
    """
    fp = self.opener.open(url, data)
    try:
        headers = fp.info()
        tfp = open(filename, 'wb')
        try:
            result = filename, headers
            bs = 1024 * 8
            size = -1
            read = 0
            blocknum = 0
            # BUG FIX: Content-Length used to be read only when a
            # reporthook was supplied, which made the completeness check
            # at the end a no-op for plain calls.  Read it unconditionally.
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            while 1:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            tfp.close()
    finally:
        fp.close()
    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError(
            _("retrieval incomplete: got only %i out of %i bytes")
            % (read, size), result)
    return result
def retrieve(self, url, filename=None, filestream=None, reporthook=None, data=None):
    """Download *url* into *filename* or an already-open *filestream*.

    Transparently inflates gzip-encoded responses.
    Returns ``(filename, headers, initialBytes)`` where *initialBytes* is
    the first (decoded) block received — useful to detect logon/redirect
    pages.  Raises ContentTooShortError when fewer raw bytes than the
    advertised Content-Length were received.
    """
    # return filename, headers (in dict), initial file bytes (to detect logon requests)
    headers = None
    initialBytes = b''
    if not filename and not filestream:
        # BUG FIX: previously fell through with `tfp` unbound and crashed
        # with UnboundLocalError in the finally block.
        raise ValueError("either filename or filestream must be provided")
    fp = self.opener.open(url, data, timeout=self.timeout)
    try:
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            tfp = filestream
        try:
            result = filename, headers
            bs = 1024 * 8
            size = -1
            read = 0
            blocknum = 0
            # BUG FIX: Content-Length was read only when a reporthook was
            # supplied, silently disabling the completeness check below.
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            isGzipped = "gzip" in headers.get("content-encoding", "")
            if isGzipped:
                # 16 + MAX_WBITS tells zlib to expect a gzip header/trailer
                # (this magic number can be inferred from the structure of a gzip file)
                decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
            while 1:
                block = fp.read(bs)
                if not block:
                    break
                # BUG FIX: count raw (wire) bytes so the comparison against
                # Content-Length below is apples-to-apples; the old code
                # counted decompressed bytes for gzipped bodies.
                read += len(block)
                if isGzipped:
                    block = decompressor.decompress(block)
                tfp.write(block)
                if blocknum == 0:
                    initialBytes = block
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
            if isGzipped:
                # BUG FIX: flush any data still buffered in the
                # decompressor; it was silently dropped before.
                tfp.write(decompressor.flush())
        finally:
            if filename:
                tfp.close()
    finally:
        if fp:
            fp.close()
    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError(
            _("retrieval incomplete: got only %i out of %i bytes")
            % (read, size), result)
    if filestream:
        tfp.seek(0)
    return filename, headers, initialBytes
def retrieve(self, url, filename=None, filestream=None, reporthook=None, data=None):
    """Download *url* into *filename* or an already-open *filestream*.

    Returns ``(filename, headers, initialBytes)`` where *initialBytes* is
    the first block received — useful to detect logon/redirect pages.
    Raises ContentTooShortError when fewer bytes than the advertised
    Content-Length were received.
    """
    # return filename, headers (in dict), initial file bytes (to detect logon requests)
    headers = None
    initialBytes = b''
    if not filename and not filestream:
        # BUG FIX: previously fell through with `tfp` unbound and crashed
        # with UnboundLocalError in the finally block.
        raise ValueError("either filename or filestream must be provided")
    fp = self.opener.open(url, data, timeout=self.timeout)
    try:
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            tfp = filestream
        try:
            result = filename, headers
            bs = 1024 * 8
            size = -1
            read = 0
            blocknum = 0
            # BUG FIX: Content-Length was read only when a reporthook was
            # supplied, silently disabling the completeness check below.
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            while 1:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                if blocknum == 0:
                    initialBytes = block
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            if filename:
                tfp.close()
    finally:
        if fp:
            fp.close()
    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError(
            _("retrieval incomplete: got only %i out of %i bytes")
            % (read, size), result)
    if filestream:
        tfp.seek(0)
    return filename, headers, initialBytes
def retrieve_file(url: URL) -> Tuple[TextIO, str]:
    """
    Fetch a file from a URL (handling SSL).

    This is based off `urllib.request.urlretrieve`.

    Returns a rewound temporary file holding the response body, plus the
    content type.  Raises InvalidConfiguration for non-HTTP(S) schemes and
    ContentTooShortError on a truncated download.
    """
    if url.scheme not in ("http", "https"):
        raise InvalidConfiguration("Illegal scheme.")
    # SECURITY FIX: a bare SSLContext performs no certificate or hostname
    # verification; create_default_context() enables both and negotiates
    # the best available TLS version.
    context = (ssl.create_default_context()
               if url.scheme == "https" else None)
    with contextlib.closing(
            urlopen(url, context=context)  # nosec - There is a check above for SSL
    ) as response:
        block_size = 1024 * 8
        size = -1
        read = 0
        headers = response.info()
        if "Content-Length" in headers:
            size = int(headers["Content-Length"])
        if "Content-Type" in headers:
            content_type = headers["Content-Type"]
        else:
            content_type = content_type_from_url(url)
        tfp = tempfile.TemporaryFile()
        try:
            while True:
                block = response.read(block_size)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
            # Seek to start
            tfp.seek(0)
            if size >= 0 and read < size:
                raise ContentTooShortError(
                    f"retrieval incomplete: got only {read} out of {size} bytes",
                    headers)
        except BaseException:
            # BUG FIX: close the temp file on any failure (not just on a
            # short read) so the handle is never leaked.
            tfp.close()
            raise
    return tfp, content_type
def url_retrieve_with_headers(url, filename=None, headers=None, reporthook=None):
    """Download *url* to *filename*, optionally sending extra request *headers*.

    Modeled on ``urllib.request.urlretrieve``.  Returns ``(filename, headers)``;
    for a ``file://`` URL with no *filename* the local path is returned without
    copying.  Raises ContentTooShortError when fewer bytes than the advertised
    Content-Length arrive.
    """
    # BUG FIX: urllib.parse.splittype() is deprecated; use the supported
    # urlparse() API.  This also resolves file://host/path URLs correctly,
    # where splittype() left the //host prefix in the returned path.
    parsed = urllib.parse.urlparse(url)
    opener = urllib.request.build_opener()
    if headers:
        opener.addheaders = list(headers.items())
    with contextlib.closing(opener.open(url)) as fp:
        headers = fp.info()
        # Just return the local path and the "headers" for file://
        # URLs. No sense in performing a copy unless requested.
        if parsed.scheme == "file" and not filename:
            return os.path.normpath(urllib.request.url2pathname(parsed.path)), headers
        tfp = open(filename, 'wb')
        with tfp:
            result = filename, headers
            bs = 1024 * 8
            size = -1
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)
    return result
def fetch_binary(src, dst, name):
    """Download the tarball at *src* into directory *dst* and extract member *name*.

    Returns the path of the extracted file.  If the target already exists the
    download is skipped.  Raises ContentTooShortError if the download is
    truncated.
    """
    tarball = os.path.join(dst, os.path.basename(src))
    target = os.path.join(dst, name)
    try:
        os.stat(target)  # short circuit to avoid redownloading tarballs
    except FileNotFoundError:
        pass
    else:
        return target
    try:
        urlretrieve(url=src, filename=tarball, reporthook=show_progress)
    except ContentTooShortError as e:
        # BUG FIX: ContentTooShortError requires (message, content); the old
        # single-argument re-raise itself crashed with TypeError.  Chain the
        # original exception for a useful traceback.
        raise ContentTooShortError(f"failed to download {src}: {e}", e.content) from e
    # The `with` block closes the archive; the old explicit tar.close()
    # inside it was redundant.
    with tarfile.open(tarball) as tar:
        tar.extract(member=name, path=dst)
    return target
def download_func(url, filename=None, ip=None, headers=None, reporthook=None, data=None):
    """Adapted from urllib.request.urlretrieve with some improvements:

    1. Optional proxy IP and extra request headers.
    2. Optional progress reporting.

    Args:
        url: link of the file to download.
        filename: destination path; a named temporary file is used when omitted.
        ip: proxy address as a string "ip:port".
        headers: extra request headers, default None.
        reporthook: progress callback, called as (downsize, size, start_time).
        data: data to send to the server.

    Returns:
        (filename, headers) describing the downloaded file.
    """
    url_type, path = splittype(url)
    req_obj = Request(url)

    def addheaders(hdrs, req):
        # BUG FIX: this helper (and `req`) used to be defined only when
        # `headers` was truthy, so any call with headers=None crashed with
        # NameError at urlopen(req, data) or in the fallback below.
        for key, value in hdrs.items():
            req.add_header(key, value)
        return req

    req = addheaders(headers, req_obj) if headers else req_obj
    # Optional proxy support
    if ip:
        proxies = {"http": ip, "https": ip, "socks": ip}
        proxy_support = ProxyHandler(proxies)
        opener = build_opener(proxy_support)
        install_opener(opener)
    # Perform the download with urlopen
    with contextlib.closing(urlopen(req, data)) as fp:
        headers = fp.info()
        # file:// URL with no destination: just hand back the local path
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers
        # Open the destination file, or create a temporary one when no
        # filename was given.
        if filename:
            tfp = open(filename, 'wb')
        else:
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            url_tempfile.append(filename)
        with tfp:
            result = filename, headers
            bs = 1024 * 20  # chunk size
            size = -1  # total size
            downsize = 0  # bytes downloaded so far
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            else:
                # Some servers omit Content-Length unless compression is
                # disabled, so probe again with Accept-Encoding: None.
                new_req = addheaders({'Accept-Encoding': 'None'}, req_obj)
                size = int(urlopen(new_req, data).info()["Content-Length"])
            if reporthook:
                start_time = time.time()  # start time for speed reporting
                reporthook(downsize, size, start_time)
            # Download loop: run until the stream is exhausted
            while True:
                block = fp.read(bs)
                if not block:
                    break
                downsize += len(block)
                tfp.write(block)
                if reporthook:
                    reporthook(downsize, size, start_time)
        if size >= 0 and downsize < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (downsize, size), result)
        return result
def _urlretrieve(url, fname=None, dir_prefix=".", headers=None, content_disposition=False,
                 blocksize=1024 * 8, timeout=None, progressbar=True, reporthook=None,
                 file_hash=None, hash_algorithm="auto", force_download=False):
    """
    A more advance version of urllib.request.urlretrieve with support of
    progress bars, automatic file name, cache and file hash
    """
    if headers is None:
        headers = {}
    dir_prefix = os.path.expanduser(dir_prefix)
    if fname is None and not content_disposition:
        fname = filename_from_url(url)
    # Check if file already exists before doing any request
    if fname is not None and os.path.exists(os.path.join(
            dir_prefix, fname)) and not force_download:
        if file_hash is not None and not validate_file(
                os.path.join(dir_prefix, fname), file_hash, hash_algorithm):
            _warn_about_different_hash(file_hash, hash_algorithm)
        else:
            return os.path.join(dir_prefix, fname)
    request = Request(url, headers=headers)
    with urlopen(request, timeout=timeout) as response:
        headers = response.info()
        # Resolve the final file name: callable, Content-Disposition, or URL
        if callable(fname):
            fname = fname(response)
        if fname is None:
            fname = headers.get_filename()
        if fname is None:
            fname = filename_from_url(url)
        if os.path.isabs(fname):
            file_path = fname
        else:
            os.makedirs(dir_prefix, exist_ok=True)
            file_path = os.path.join(dir_prefix, fname)
        _, extension = splitext(fname)
        if not extension:
            extension = guess_extension(headers.get_content_type() or "")
            if extension is not None:
                file_path += extension
        # Cache check again, now that the final path is known
        if os.path.exists(file_path) and not force_download:
            if file_hash is not None and not validate_file(
                    file_path, file_hash, hash_algorithm):
                _warn_about_different_hash(file_hash, hash_algorithm)
            else:
                return file_path
        content_length = int(headers.get("Content-Length", -1))
        blocknum = 0
        bytes_read = 0
        # Download to a ".download" temp path first so an interrupted
        # transfer never leaves a partial file at the final location.
        download_file_path = file_path + ".download"
        with open(download_file_path, "wb") as fp, tqdm(
                total=content_length, unit='B', unit_scale=True, miniters=1,
                unit_divisor=1024, desc="Downloading {}...".format(fname),
                disable=not progressbar) as pbar:
            while True:
                block = response.read(blocksize)
                if not block:
                    break
                fp.write(block)
                blocknum += 1
                bytes_read += len(block)
                # BUG FIX: advance the bar by the bytes actually received;
                # updating by `blocksize` overshot on the final partial block.
                pbar.update(len(block))
                if reporthook is not None:
                    reporthook(blocknum, blocksize, content_length)
        if content_length >= 0 and bytes_read < content_length:
            error_msg = "retrieval incomplete: got only {} out of {} bytes".format(
                bytes_read, content_length)
            raise ContentTooShortError(error_msg, (download_file_path, headers))
        os.rename(download_file_path, file_path)
    return file_path
def http_open(self, req):
    """Provide http_open to raise exception."""
    # Always fail: simulate a truncated HTTP response.
    error = ContentTooShortError("Expected 1000 bytes", CALENDAR_DATA)
    raise error