def get(self, url, *args, **kwargs):
    """
    Issue a GET through the parent class, surfacing warcprox limits.

    A response with status 420 that carries a ``warcprox-meta`` header is
    turned into a ``brozzler.ReachedLimit`` exception built from the decoded
    header and the response text. Any other response is returned unchanged.
    """
    response = super().get(url, *args, **kwargs)
    limit_hit = (
        response.status_code == 420
        and 'warcprox-meta' in response.headers)
    if not limit_hit:
        return response
    raise brozzler.ReachedLimit(
        warcprox_meta=json.loads(response.headers['warcprox-meta']),
        http_payload=response.text)
def _network_response_received(self, message): if (not self._reached_limit and message["params"]["response"]["status"] == 420 and "Warcprox-Meta" in CaseInsensitiveDict( message["params"]["response"]["headers"])): warcprox_meta = json.loads( CaseInsensitiveDict( message["params"]["response"]["headers"])["Warcprox-Meta"]) self._reached_limit = brozzler.ReachedLimit( warcprox_meta=warcprox_meta) self.logger.info("reached limit %s", self._reached_limit) if self.on_response: self.on_response(message)
def _try_youtube_dl(self, ydl, site, page):
    """
    Run youtube-dl on ``page`` and archive its json when using warcprox.

    Calls ``ydl.extract_info`` on the whatwg-canonicalized page url, hands
    ``ydl.brozzler_spy`` to ``self._remember_videos`` and, if
    ``self._using_warcprox(site)`` is true, sends the sorted, indented info
    json to warcprox as a ``metadata`` record.

    Raises:
        brozzler.ShutdownRequested: re-raised untouched.
        brozzler.ReachedLimit: the failure wraps an HTTPError with code 420.
        brozzler.ProxyError: the failure wraps a URLError and a proxy is
            configured for ``site``.

    A failure wrapping ``youtube_dl.utils.UnsupportedError`` is swallowed;
    every other exception is re-raised as is.
    """
    try:
        self.logger.info("trying youtube-dl on %s", page)

        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            info = ydl.extract_info(str(urlcanon.whatwg(page.url)))
        self._remember_videos(page, ydl.brozzler_spy)
        if self._using_warcprox(site):
            info_json = json.dumps(info, sort_keys=True, indent=4)
            self.logger.info(
                "sending WARCPROX_WRITE_RECORD request to warcprox "
                "with youtube-dl json for %s", page)
            self._warcprox_write_record(
                warcprox_address=self._proxy_for(site),
                url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                warc_type="metadata",
                content_type=(
                    "application/vnd.youtube-dl_formats+json;charset=utf-8"),
                payload=info_json.encode("utf-8"),
                extra_headers=site.extra_headers())
    except brozzler.ShutdownRequested:
        raise
    except BaseException as e:
        # The wrapped-exception info may be missing, None or empty; in all
        # of those cases fall through to re-raising the original exception
        # instead of failing with TypeError/IndexError inside this handler.
        exc_info = getattr(e, "exc_info", None) or (None, None, None)
        exc_type, exc_value = exc_info[0], exc_info[1]

        if exc_type == youtube_dl.utils.UnsupportedError:
            # youtube-dl has nothing to offer for this page; not an error
            pass
        elif (exc_type == urllib.error.HTTPError
                and getattr(exc_value, "code", None) == 420):
            raise brozzler.ReachedLimit(exc_value)
        elif exc_type == urllib.error.URLError and self._proxy_for(site):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                'youtube-dl hit apparent proxy error from '
                '%s' % page.url) from e
        else:
            raise
def _network_response_received(self, message): if (message['params']['response']['status'] == 420 and 'Warcprox-Meta' in CaseInsensitiveDict( message['params']['response']['headers'])): if not self.reached_limit: warcprox_meta = json.loads(CaseInsensitiveDict( message['params']['response']['headers'])['Warcprox-Meta']) self.reached_limit = brozzler.ReachedLimit( warcprox_meta=warcprox_meta) self.logger.info('reached limit %s', self.reached_limit) brozzler.thread_raise( self.calling_thread, brozzler.ReachedLimit) else: self.logger.info( 'reached limit but self.reached_limit is already set, ' 'assuming the calling thread is already handling this') if self.on_response: self.on_response(message)
def _try_youtube_dl(worker, ydl, site, page):
    """
    Run yt-dlp on ``page`` and archive its json when using warcprox.

    Calls ``ydl.extract_info`` on the whatwg-canonicalized page url, passes
    the result through ``ydl.sanitize_info``, records the spied fetches and
    stitch-ups with ``_remember_videos`` and, if
    ``worker._using_warcprox(site)`` is true, sends the sorted, indented info
    json to warcprox as a ``metadata`` record.

    Returns:
        The sanitized extraction result, or None when the failure wraps
        ``youtube_dl.utils.UnsupportedError``.

    Raises:
        brozzler.ShutdownRequested: re-raised untouched.
        brozzler.ReachedLimit: the failure wraps an HTTPError with code 420.
        brozzler.ProxyError: the failure wraps a URLError and a proxy is
            configured for ``site``.

    Every other ``Exception`` is re-raised as is.
    """
    try:
        logging.info("trying yt-dlp on %s", page)

        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            # and yt-dlp needs sanitize_info for extract_info
            ie_result = ydl.sanitize_info(
                ydl.extract_info(str(urlcanon.whatwg(page.url))))
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if worker._using_warcprox(site):
            info_json = json.dumps(ie_result, sort_keys=True, indent=4)
            logging.info(
                "sending WARCPROX_WRITE_RECORD request to warcprox "
                "with yt-dlp json for %s", page)
            worker._warcprox_write_record(
                warcprox_address=worker._proxy_for(site),
                url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                warc_type="metadata",
                content_type=(
                    "application/vnd.youtube-dl_formats+json;charset=utf-8"),
                payload=info_json.encode("utf-8"),
                extra_headers=site.extra_headers(page))
        return ie_result
    except brozzler.ShutdownRequested:
        raise
    except Exception as e:
        # The wrapped-exception info may be missing, None or empty; in all
        # of those cases fall through to re-raising the original exception
        # instead of failing with TypeError/IndexError inside this handler.
        exc_info = getattr(e, "exc_info", None) or (None, None, None)
        exc_type, exc_value = exc_info[0], exc_info[1]

        if exc_type == youtube_dl.utils.UnsupportedError:
            # nothing extractable on this page; not an error
            return None
        elif (exc_type == urllib.error.HTTPError
                and getattr(exc_value, "code", None) == 420):
            raise brozzler.ReachedLimit(exc_value)
        elif exc_type == urllib.error.URLError and worker._proxy_for(site):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                'yt-dlp hit apparent proxy error from '
                '%s' % page.url) from e
        else:
            raise