def _fetch_url(self, url):
    """Schedule a download of *url* and hook up its completion callbacks.

    Args:
        url: The file to download as QUrl.
    """
    # Only assets served over HTTP(S) are fetched; other schemes are skipped.
    if url.scheme() not in ('http', 'https'):
        return
    if url in self.loaded_urls:
        # Already requested once -- never load an asset twice.
        return
    self.loaded_urls.add(url)
    log.downloads.debug("loading asset at {}".format(url))
    # Using the download manager to download host-blocked urls might crash
    # qute, see the comments/discussion on
    # https://github.com/qutebrowser/qutebrowser/pull/962#discussion_r40256987
    # and https://github.com/qutebrowser/qutebrowser/issues/1053
    blocker = objreg.get('host-blocker')
    if blocker.is_blocked(url):
        log.downloads.debug("Skipping {}, host-blocked".format(url))
        # A stub entry still has to go into the output: QWebView can be
        # pretty picky about displaying a file correctly when not all
        # assets are at least referenced in the mhtml file.
        self.writer.add_file(urlutils.encoded_url(url), b'')
        return
    manager = objreg.get('qtnetwork-download-manager')
    dl_target = downloads.FileObjDownloadTarget(_NoCloseBytesIO())
    dl_item = manager.get(url, target=dl_target, auto_remove=True)
    self.pending_downloads.add((url, dl_item))
    # All three outcomes need the same (url, item) context bound in.
    for signal, handler in [(dl_item.finished, self._finished),
                            (dl_item.error, self._error),
                            (dl_item.cancelled, self._cancelled)]:
        signal.connect(functools.partial(handler, url, dl_item))
def _fetch_url(self, url):
    """Schedule a download of *url* and hook up its completion callbacks.

    Args:
        url: The file to download as QUrl.
    """
    # Only assets served over HTTP(S) are fetched; other schemes are skipped.
    if url.scheme() not in {"http", "https"}:
        return
    if url in self.loaded_urls:
        # Already requested once -- never load an asset twice.
        return
    self.loaded_urls.add(url)
    log.downloads.debug("loading asset at {}".format(url))
    # Using the download manager to download host-blocked urls might crash
    # qute, see the comments/discussion on
    # https://github.com/The-Compiler/qutebrowser/pull/962#discussion_r40256987
    # and https://github.com/The-Compiler/qutebrowser/issues/1053
    blocker = objreg.get("host-blocker")
    if blocker.is_blocked(url):
        log.downloads.debug("Skipping {}, host-blocked".format(url))
        # A stub entry still has to go into the output: QWebView can be
        # pretty picky about displaying a file correctly when not all
        # assets are at least referenced in the mhtml file.
        self.writer.add_file(urlutils.encoded_url(url), b"")
        return
    manager = objreg.get("download-manager", scope="window",
                         window=self._win_id)
    dl_item = manager.get(url, fileobj=_NoCloseBytesIO(), auto_remove=True)
    self.pending_downloads.add((url, dl_item))
    # finished/error/cancelled all need the same (url, item) context;
    # cancelled is routed to _error here, matching the original wiring.
    for signal, handler in [(dl_item.finished, self._finished),
                            (dl_item.error, self._error),
                            (dl_item.cancelled, self._error)]:
        signal.connect(functools.partial(handler, url, dl_item))
def _finished(self, url, item):
    """Callback when a single asset is downloaded.

    Decodes the Content-Type header, recursively fetches any stylesheets
    referenced from downloaded CSS, adds the payload to the MHTML writer,
    and finalizes the output file once no downloads remain pending.

    Args:
        url: The original url of the asset as QUrl.
        item: The DownloadItem given by the DownloadManager
    """
    # self.writer is set up before downloads are started; reaching this
    # callback without one is an internal invariant violation.
    if self.writer is None:
        raise AssertionError
    self.pending_downloads.remove((url, item))
    mime = item.raw_headers.get(b'Content-Type', b'')

    # Note that this decoding always works and doesn't produce errors
    # RFC 7230 (https://tools.ietf.org/html/rfc7230) states:
    # Historically, HTTP has allowed field content with text in the
    # ISO-8859-1 charset [ISO-8859-1], supporting other charsets only
    # through use of [RFC2047] encoding. In practice, most HTTP header
    # field values use only a subset of the US-ASCII charset [USASCII].
    # Newly defined header fields SHOULD limit their field values to
    # US-ASCII octets. A recipient SHOULD treat other octets in field
    # content (obs-text) as opaque data.
    mime = mime.decode('iso-8859-1')

    if mime.lower() == 'text/css' or url.fileName().endswith('.css'):
        # We can't always assume that CSS files are UTF-8, but CSS files
        # shouldn't contain many non-ASCII characters anyway (in most
        # cases). Using "ignore" lets us decode the file even if it's
        # invalid UTF-8 data.
        # The file written to the MHTML file won't be modified by this
        # decoding, since there we're taking the original bytestream.
        try:
            css_string = item.fileobj.getvalue().decode('utf-8')
        except UnicodeDecodeError:
            log.downloads.warning("Invalid UTF-8 data in {}".format(url))
            css_string = item.fileobj.getvalue().decode('utf-8', 'ignore')
        # Recursively download stylesheets referenced by this one.
        import_urls = _get_css_imports(css_string)
        for import_url in import_urls:
            absolute_url = url.resolved(QUrl(import_url))
            self._fetch_url(absolute_url)

    # Text payloads go in as quoted-printable, everything else as base64.
    encode = E_QUOPRI if mime.startswith('text/') else E_BASE64
    # Our MHTML handler refuses non-ASCII headers. This will replace every
    # non-ASCII char with '?'. This is probably okay, as official Content-
    # Type headers contain ASCII only anyway. Anything else is madness.
    mime = utils.force_encoding(mime, 'ascii')
    self.writer.add_file(urlutils.encoded_url(url), item.fileobj.getvalue(),
                         mime, encode)
    item.fileobj.actual_close()
    if self.pending_downloads:
        return
    # That was the last pending download -> write out the archive.
    self._finish_file()
def _finished(self, url, item):
    """Callback when a single asset is downloaded.

    Args:
        url: The original url of the asset as QUrl.
        item: The DownloadItem given by the DownloadManager
    """
    self.pending_downloads.remove((url, item))
    # Note that this decoding always works and doesn't produce errors
    # RFC 7230 (https://tools.ietf.org/html/rfc7230) states:
    # Historically, HTTP has allowed field content with text in the
    # ISO-8859-1 charset [ISO-8859-1], supporting other charsets only
    # through use of [RFC2047] encoding. In practice, most HTTP header
    # field values use only a subset of the US-ASCII charset [USASCII].
    # Newly defined header fields SHOULD limit their field values to
    # US-ASCII octets. A recipient SHOULD treat other octets in field
    # content (obs-text) as opaque data.
    mime = item.raw_headers.get(b'Content-Type', b'').decode('iso-8859-1')

    if mime.lower() == 'text/css' or url.fileName().endswith('.css'):
        # We can't always assume that CSS files are UTF-8, but CSS files
        # shouldn't contain many non-ASCII characters anyway (in most
        # cases). Using "ignore" lets us decode the file even if it's
        # invalid UTF-8 data.
        # The file written to the MHTML file won't be modified by this
        # decoding, since there we're taking the original bytestream.
        data = item.fileobj.getvalue()
        try:
            css_string = data.decode('utf-8')
        except UnicodeDecodeError:
            log.downloads.warning("Invalid UTF-8 data in {}".format(url))
            css_string = data.decode('utf-8', 'ignore')
        # Pull in stylesheets that this stylesheet references itself.
        for import_url in _get_css_imports(css_string):
            self._fetch_url(url.resolved(QUrl(import_url)))

    # Text payloads go in as quoted-printable, everything else as base64.
    if mime.startswith('text/'):
        encode = E_QUOPRI
    else:
        encode = E_BASE64
    # Our MHTML handler refuses non-ASCII headers. This will replace every
    # non-ASCII char with '?'. This is probably okay, as official Content-
    # Type headers contain ASCII only anyway. Anything else is madness.
    mime = utils.force_encoding(mime, 'ascii')
    self.writer.add_file(urlutils.encoded_url(url), item.fileobj.getvalue(),
                         mime, encode)
    item.fileobj.actual_close()
    if not self.pending_downloads:
        # That was the last pending download -> write out the archive.
        self._finish_file()
def _error(self, url, item, *_args):
    """Callback when a download error occurred.

    Args:
        url: The original url of the asset as QUrl.
        item: The DownloadItem given by the DownloadManager.
    """
    try:
        self.pending_downloads.remove((url, item))
    except KeyError:
        # This might happen if .collect_zombies() calls .finished() and the
        # error handler will be called after .collect_zombies
        log.downloads.debug("Oops! Download already gone: {}".format(item))
        return
    item.fileobj.actual_close()
    # Add a stub file, see comment in .fetch_url() for more information
    self.writer.add_file(urlutils.encoded_url(url), b'')
    if not self.pending_downloads:
        # That was the last pending download -> write out the archive.
        self._finish_file()
def _error(self, url, item, *_args):
    """Callback when a download error occurred.

    Removes the failed download from the pending set and writes an empty
    stub entry for it, finalizing the file if nothing else is pending.

    Args:
        url: The original url of the asset as QUrl.
        item: The DownloadItem given by the DownloadManager.
    """
    try:
        self.pending_downloads.remove((url, item))
    except KeyError:
        # This might happen if .collect_zombies() calls .finished() and the
        # error handler will be called after .collect_zombies
        log.downloads.debug("Oops! Download already gone: {}".format(item))
        return
    item.fileobj.actual_close()
    # Add a stub file, see comment in .fetch_url() for more information
    self.writer.add_file(urlutils.encoded_url(url), b'')
    if self.pending_downloads:
        return
    # That was the last pending download -> write out the archive.
    self._finish_file()
def _fetch_url(self, url):
    """Schedule a download of *url* and hook up its completion callbacks.

    Args:
        url: The file to download as QUrl.
    """
    # Only assets served over HTTP(S) are fetched; other schemes are skipped.
    if url.scheme() not in ('http', 'https'):
        return
    if url in self.loaded_urls:
        # Already requested once -- never load an asset twice.
        return
    self.loaded_urls.add(url)
    log.downloads.debug("loading asset at {}".format(url))
    # Using the download manager to download host-blocked urls might crash
    # qute, see the comments/discussion on
    # https://github.com/qutebrowser/qutebrowser/pull/962#discussion_r40256987
    # and https://github.com/qutebrowser/qutebrowser/issues/1053
    req = interceptors.Request(first_party_url=None, request_url=url)
    interceptors.run(req)
    if req.is_blocked:
        log.downloads.debug("Skipping {}, host-blocked".format(url))
        # A stub entry still has to go into the output: QWebView can be
        # pretty picky about displaying a file correctly when not all
        # assets are at least referenced in the mhtml file.
        self.writer.add_file(urlutils.encoded_url(url), b'')
        return
    manager = objreg.get('qtnetwork-download-manager')
    dl_item = manager.get(
        url, target=downloads.FileObjDownloadTarget(_NoCloseBytesIO()),
        auto_remove=True)
    self.pending_downloads.add((url, dl_item))
    # All three outcomes need the same (url, item) context bound in.
    for signal, handler in [(dl_item.finished, self._finished),
                            (dl_item.error, self._error),
                            (dl_item.cancelled, self._cancelled)]:
        signal.connect(functools.partial(handler, url, dl_item))
def test_encoded_url(url, expected):
    """Check that encoded_url produces the expected string for a QUrl."""
    assert urlutils.encoded_url(QUrl(url)) == expected
def run(self):
    """Download and save the page.

    The object must not be reused, you should create a new one if
    you want to download another page.
    """
    if self._used:
        raise ValueError("Downloader already used")
    self._used = True
    web_url = self.web_view.url()
    web_frame = self.web_view.page().mainFrame()

    self.writer = MHTMLWriter(
        web_frame.toHtml().encode('utf-8'),
        content_location=urlutils.encoded_url(web_url),
        # I've found no way of getting the content type of a QWebView, but
        # since we're using .toHtml, it's probably safe to say that the
        # content-type is HTML
        content_type='text/html; charset="UTF-8"',
    )
    # Currently only downloading <link> (stylesheets), <script>
    # (javascript) and <img> (image) elements.
    elements = web_frame.findAllElements('link, script, img')
    for element in elements:
        element = webelem.WebElementWrapper(element)
        # Websites are free to set whatever rel=... attribute they want.
        # We just care about stylesheets and icons.
        if not _check_rel(element):
            continue
        if 'src' in element:
            element_url = element['src']
        elif 'href' in element:
            element_url = element['href']
        else:
            # Might be a local <script> tag or something else
            continue
        # Element URLs may be relative -> resolve against the page URL.
        absolute_url = web_url.resolved(QUrl(element_url))
        self._fetch_url(absolute_url)

    # Inline <style> blocks can reference further assets via CSS imports.
    styles = web_frame.findAllElements('style')
    for style in styles:
        style = webelem.WebElementWrapper(style)
        # The Mozilla Developer Network says:
        # type: This attribute defines the styling language as a MIME type
        # (charset should not be specified). This attribute is optional and
        # default to text/css if it's missing.
        # https://developer.mozilla.org/en/docs/Web/HTML/Element/style
        if 'type' in style and style['type'] != 'text/css':
            continue
        for element_url in _get_css_imports(str(style)):
            self._fetch_url(web_url.resolved(QUrl(element_url)))

    # Search for references in inline styles
    for element in web_frame.findAllElements('[style]'):
        element = webelem.WebElementWrapper(element)
        style = element['style']
        for element_url in _get_css_imports(style, inline=True):
            self._fetch_url(web_url.resolved(QUrl(element_url)))

    # Shortcut if no assets need to be downloaded, otherwise the file would
    # never be saved. Also might happen if the downloads are fast enough to
    # complete before connecting their finished signal.
    self._collect_zombies()
    if not self.pending_downloads and not self._finished_file:
        self._finish_file()
def run(self):
    """Download and save the page.

    The object must not be reused, you should create a new one if
    you want to download another page.
    """
    if self._used:
        raise ValueError("Downloader already used")
    self._used = True
    web_url = self.tab.url()

    # FIXME:qtwebengine have a proper API for this
    page = self.tab._widget.page()  # pylint: disable=protected-access
    web_frame = page.mainFrame()

    self.writer = MHTMLWriter(
        web_frame.toHtml().encode('utf-8'),
        content_location=urlutils.encoded_url(web_url),
        # I've found no way of getting the content type of a QWebView, but
        # since we're using .toHtml, it's probably safe to say that the
        # content-type is HTML
        content_type='text/html; charset="UTF-8"',
    )
    # Currently only downloading <link> (stylesheets), <script>
    # (javascript) and <img> (image) elements.
    elements = web_frame.findAllElements('link, script, img')
    for element in elements:
        element = webkitelem.WebKitElement(element, tab=self.tab)
        # Websites are free to set whatever rel=... attribute they want.
        # We just care about stylesheets and icons.
        if not _check_rel(element):
            continue
        if 'src' in element:
            element_url = element['src']
        elif 'href' in element:
            element_url = element['href']
        else:
            # Might be a local <script> tag or something else
            continue
        # Element URLs may be relative -> resolve against the page URL.
        absolute_url = web_url.resolved(QUrl(element_url))
        self._fetch_url(absolute_url)

    # Inline <style> blocks can reference further assets via CSS imports.
    styles = web_frame.findAllElements('style')
    for style in styles:
        style = webkitelem.WebKitElement(style, tab=self.tab)
        # The Mozilla Developer Network says:
        # > type: This attribute defines the styling language as a MIME
        # > type (charset should not be specified). This attribute is
        # > optional and default to text/css if it's missing.
        # https://developer.mozilla.org/en/docs/Web/HTML/Element/style
        if 'type' in style and style['type'] != 'text/css':
            continue
        for element_url in _get_css_imports(str(style)):
            self._fetch_url(web_url.resolved(QUrl(element_url)))

    # Search for references in inline styles
    for element in web_frame.findAllElements('[style]'):
        element = webkitelem.WebKitElement(element, tab=self.tab)
        style = element['style']
        for element_url in _get_css_imports(style, inline=True):
            self._fetch_url(web_url.resolved(QUrl(element_url)))

    # Shortcut if no assets need to be downloaded, otherwise the file would
    # never be saved. Also might happen if the downloads are fast enough to
    # complete before connecting their finished signal.
    self._collect_zombies()
    if not self.pending_downloads and not self._finished_file:
        self._finish_file()
def test_encoded_url(url, expected):
    """Check that encoded_url produces the expected string for a QUrl."""
    url = QUrl(url)
    assert urlutils.encoded_url(url) == expected