def _do_request(self, method, load_url, data, req_headers, params, is_live):
    """
    Issue one upstream request and hand back the unread response.

    The adapter is ``DefaultAdapters.live_adapter`` when ``is_live`` is
    truthy, otherwise ``DefaultAdapters.remote_adapter``. When the
    module-level ``SOCKS_PROXIES`` is set, the connection is obtained from
    the adapter for that proxy mapping; otherwise the adapter's pool
    manager is used directly.

    The request is sent without following redirects and without
    preloading or decoding the body. The timeout is read from
    ``params['_timeout']`` (``None`` when absent).

    :raises LiveResourceException: on any error raised while sending the
        request; the exception carries ``load_url``.
    """
    if is_live:
        adapter = DefaultAdapters.live_adapter
    else:
        adapter = DefaultAdapters.remote_adapter

    retry_policy = adapter.max_retries

    if not SOCKS_PROXIES:
        connection = adapter.poolmanager
    else:
        connection = adapter.get_connection(load_url, SOCKS_PROXIES)

    try:
        request_args = dict(method=method,
                            url=load_url,
                            body=data,
                            headers=req_headers,
                            redirect=False,
                            assert_same_host=False,
                            preload_content=False,
                            decode_content=False,
                            retries=retry_policy,
                            timeout=params.get('_timeout'))

        return connection.urlopen(**request_args)

    except Exception as err:
        # the traceback and failure line are only produced at DEBUG level
        if logger.isEnabledFor(logging.DEBUG):
            import traceback
            traceback.print_exc()
            failure = 'FAILED: ' + method + ' ' + load_url + ': ' + str(err)
            logger.debug(failure)

        raise LiveResourceException(load_url)
def raise_on_self_redirect(self, params, cdx, status_code, location_url):
    """
    Check if response is a 3xx redirect to the same url
    If so, reject this capture to avoid causing redirect loop

    Live captures (``cdx['is_live']``), non-3xx statuses, ``304`` and
    responses without a Location value are ignored.

    Both urls are compared lowercased, without scheme and without trailing
    slashes. The Location value is normalized according to its form:

    * ``//host/path`` (network-path reference): already carries its host,
      so only the leading ``//`` is dropped
    * ``/path`` (host-relative): prefixed with the host of ``cdx['url']``
    * anything else: the scheme, if any, is stripped

    :param params: request params; ``params['url']`` is the requested url
    :param cdx: index entry of the capture; ``cdx['url']`` supplies the
        host for host-relative redirects
    :param status_code: response status code as a string
    :param location_url: value of the Location header, may be empty
    :raises LiveResourceException: if the redirect target is the
        requested url
    """
    if cdx.get('is_live'):
        return

    if not status_code.startswith('3') or status_code == '304':
        return

    request_url = params['url'].lower()

    if not location_url:
        return

    if location_url.startswith('//'):
        # must be tested before the single '/' case, which it also matches
        location_url = location_url[2:]
    elif location_url.startswith('/'):
        # no scheme stripping here: a '://' inside the path or query
        # (e.g. /login?next=http://...) is not a scheme separator
        location_url = urlsplit(cdx['url']).netloc + location_url
    else:
        location_url = location_url.split('://', 1)[-1]

    # lowercase only after the host was added, so that a mixed-case host
    # taken from cdx['url'] still compares equal to the lowercased request
    location_url = location_url.lower().rstrip('/')
    request_url = request_url.split('://', 1)[-1].rstrip('/')

    if request_url != location_url:
        return

    msg = 'Self Redirect {0} -> {1}'.format(request_url, location_url)
    raise LiveResourceException(msg)
def handle_request(self, wbrequest):
    """
    Serve a live-web request.

    Query-type urls are answered with a redirect to the latest-replay form
    of the url (empty timestamp). Everything else is passed to
    ``self.render_content``; if that raises, the traceback is printed and
    the failure is reported as a ``LiveResourceException`` carrying the
    requested url.
    """
    wb_url = wbrequest.wb_url

    if wb_url.is_query():
        latest_url = wbrequest.urlrewriter.get_new_url(
            type=wb_url.LATEST_REPLAY,
            timestamp='')
        return WbResponse.redir_response(latest_url)

    try:
        return self.render_content(wbrequest)
    except Exception:
        import traceback
        print(traceback.format_exc())

        failed_url = wbrequest.wb_url.url
        raise LiveResourceException(
            msg='Could not load the url from the live web: ' + failed_url,
            url=failed_url)
def raise_on_self_redirect(self, params, cdx, status_code, location_url):
    """
    Check if response is a 3xx redirect to the same url
    If so, reject this capture to avoid causing redirect loop

    Live captures (``cdx['is_live']``), non-3xx statuses, ``304`` and
    responses without a Location value are ignored.

    A redirect counts as a self-redirect when either

    * the normalized Location equals the normalized request url
      (lowercased, no scheme, no trailing slashes), or
    * the canonicalized Location equals the original url key, which is
      ``params['sr-urlkey']`` if already set, else ``cdx['urlkey']``

    The Location value is normalized according to its form:

    * ``//host/path`` (network-path reference): already carries its host,
      so only the leading ``//`` is dropped
    * ``/path`` (host-relative): prefixed with the host of ``cdx['url']``
    * anything else: the scheme, if any, is stripped

    On a self-redirect the original key is stored in
    ``params['sr-urlkey']`` before raising.

    :param params: request params; ``params['url']`` is the requested url
    :param cdx: index entry of the capture (``url`` and ``urlkey`` are read)
    :param status_code: response status code as a string
    :param location_url: value of the Location header, may be empty
    :raises LiveResourceException: if the redirect is a self-redirect
    """
    if cdx.get('is_live'):
        return

    if not status_code.startswith('3') or status_code == '304':
        return

    request_url = params['url'].lower()

    if not location_url:
        return

    if location_url.startswith('//'):
        # must be tested before the single '/' case, which it also matches
        location_url = location_url[2:]
    elif location_url.startswith('/'):
        # no scheme stripping here: a '://' inside the path or query
        # (e.g. /login?next=http://...) is not a scheme separator
        location_url = urlsplit(cdx['url']).netloc + location_url
    else:
        location_url = location_url.split('://', 1)[-1]

    # lowercase only after the host was added, so that a mixed-case host
    # taken from cdx['url'] still compares equal to the lowercased request
    location_url = location_url.lower().rstrip('/')
    request_url = request_url.split('://', 1)[-1].rstrip('/')

    orig_key = params.get('sr-urlkey') or cdx['urlkey']

    # canonicalization is only attempted when the direct comparison fails
    if request_url != location_url and canonicalize(location_url) != orig_key:
        return

    params['sr-urlkey'] = orig_key

    msg = 'Self Redirect {0} -> {1}'.format(request_url, location_url)
    raise LiveResourceException(msg)
def _do_request(self, method, load_url, data, req_headers, params, is_live):
    """
    Issue one upstream request and hand back the unread response.

    The adapter is ``DefaultAdapters.live_adapter`` when ``is_live`` is
    truthy, otherwise ``DefaultAdapters.remote_adapter``. The request goes
    through the adapter's proxy manager for ``self.socks_proxy`` when one
    is configured and the ``SOCKS_DISABLE`` environment variable is not
    set; otherwise the adapter's pool manager is used.

    The request is sent without following redirects and without
    preloading or decoding the body. The timeout is read from
    ``params['_timeout']`` (``None`` when absent).

    :raises LiveResourceException: on any error raised while sending the
        request; the exception carries ``load_url``.
    """
    if is_live:
        adapter = DefaultAdapters.live_adapter
    else:
        adapter = DefaultAdapters.remote_adapter

    retry_policy = adapter.max_retries

    # pick the plain pool manager unless a socks proxy is configured and
    # has not been switched off through the environment
    if not self.socks_proxy or os.environ.get('SOCKS_DISABLE'):
        manager = adapter.poolmanager
    else:
        manager = adapter.proxy_manager_for(self.socks_proxy)

    response = None
    try:
        request_args = dict(method=method,
                            url=load_url,
                            body=data,
                            headers=req_headers,
                            redirect=False,
                            assert_same_host=False,
                            preload_content=False,
                            decode_content=False,
                            retries=retry_policy,
                            timeout=params.get('_timeout'))

        response = manager.urlopen(**request_args)
        return response

    except Exception as err:
        if response:
            no_except_close(response)

        # the traceback and failure line are only produced at DEBUG level
        if logger.isEnabledFor(logging.DEBUG):
            import traceback
            traceback.print_exc()
            failure = 'FAILED: ' + method + ' ' + load_url + ': ' + str(err)
            logger.debug(failure)

        raise LiveResourceException(load_url)
def load_resource(self, cdx, params):
    """
    Load the resource described by ``cdx`` from its ``load_url`` and
    build WARC response headers plus the raw HTTP header block for it.

    :param cdx: index entry; the keys read here are ``load_url``,
        ``is_live``, ``timestamp``, ``memento_url``, ``set_referrer`` and
        ``url``. ``cdx['timestamp']`` is overwritten when the upstream
        response carries a ``Memento-Datetime`` header, and
        ``cdx['source']`` is set when the upstream is flagged as ``warc``.
    :param params: request params; ``params['_input_req']`` supplies the
        request headers, method and body, ``params['content_type']`` is
        compared against ``VideoLoader.CONTENT_TYPE``.

    :return: one of

        * ``None`` if ``cdx`` has no ``load_url``, if the requested
          content type is the video loader's, or if ``memento_url`` is
          set but the response has no ``Memento-Datetime`` header
        * ``(None, upstream_res.headers, upstream_res)`` if the response
          has ``Warcserver-Type: warc``
        * ``(warc_headers, http_headers_buff, upstream_res)`` otherwise,
          where ``warc_headers`` is a ``StatusAndHeaders`` with protocol
          ``WARC/1.0``

    :raises LiveResourceException: if ``load_url`` can not be prepared
    """
    load_url = cdx.get('load_url')
    if not load_url:
        return None

    if params.get('content_type') == VideoLoader.CONTENT_TYPE:
        return None

    # non-live loads are optionally routed through a forward proxy prefix
    if self.forward_proxy_prefix and not cdx.get('is_live'):
        load_url = self.forward_proxy_prefix + load_url

    input_req = params['_input_req']

    req_headers = input_req.get_req_headers()

    dt = timestamp_to_datetime(cdx['timestamp'])

    if cdx.get('memento_url'):
        req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

    method = input_req.get_req_method()
    data = input_req.get_req_body()

    # PreparedRequest is used only to normalize the url and to derive an
    # Authorization header from it; the request itself is sent below
    # NOTE(review): presumably requests.PreparedRequest -- confirm import
    p = PreparedRequest()
    try:
        p.prepare_url(load_url, None)
    except:
        raise LiveResourceException(load_url)
    p.prepare_headers(None)
    p.prepare_auth(None, load_url)

    auth = p.headers.get('Authorization')
    if auth:
        req_headers['Authorization'] = auth

    load_url = p.url

    # host is set to the actual host for live loading
    # ensure it is set to the load_url host
    if not cdx.get('is_live'):
        #req_headers.pop('Host', '')
        req_headers['Host'] = urlsplit(p.url).netloc

        referrer = cdx.get('set_referrer')
        if referrer:
            req_headers['Referer'] = referrer

    upstream_res = self._do_request_with_redir_check(
        method, load_url, data, req_headers, params, cdx)

    memento_dt = upstream_res.headers.get('Memento-Datetime')
    if memento_dt:
        # the datetime reported by the upstream replaces the index one
        dt = http_date_to_datetime(memento_dt)
        cdx['timestamp'] = datetime_to_timestamp(dt)
    elif cdx.get('memento_url'):
        # if 'memento_url' set and no Memento-Datetime header present
        # then its an error
        # NOTE(review): upstream_res is neither closed nor released on
        # this path -- confirm that nothing leaks here
        return None

    agg_type = upstream_res.headers.get('Warcserver-Type')
    if agg_type == 'warc':
        # upstream response is passed through without building WARC
        # headers here (first tuple element is None)
        cdx['source'] = unquote(
            upstream_res.headers.get('Warcserver-Source-Coll'))
        return None, upstream_res.headers, upstream_res

    if upstream_res.version == 11:
        version = '1.1'
    else:
        version = '1.0'

    # rebuild the HTTP status line, then append the headers to it
    status = 'HTTP/{version} {status} {reason}\r\n'
    status = status.format(version=version,
                           status=upstream_res.status,
                           reason=upstream_res.reason)

    http_headers_buff = status

    orig_resp = upstream_res._original_response

    # Two ways of reading the raw headers are attempted: the first block
    # reads orig_resp.headers._headers as (name, value) pairs; if anything
    # in it raises, the second block reads orig_resp.msg.headers as raw
    # lines instead.
    # NOTE(review): if the first block fails part-way, the header lines it
    # already appended stay in http_headers_buff before the second block
    # appends again -- confirm this can only fail on the first statement
    try:  #pragma: no cover
        #PY 3
        resp_headers = orig_resp.headers._headers
        for n, v in resp_headers:
            nl = n.lower()
            if nl in self.SKIP_HEADERS:
                continue

            if nl in self.UNREWRITE_HEADERS:
                v = self.unrewrite_header(cdx, v)

            http_headers_buff += n + ': ' + v + '\r\n'

        http_headers_buff += '\r\n'

        try:
            # http headers could be encoded as utf-8 (though non-standard)
            # first try utf-8 encoding
            http_headers_buff = http_headers_buff.encode('utf-8')
        except:
            # then, fall back to latin-1
            http_headers_buff = http_headers_buff.encode('latin-1')

    except:  #pragma: no cover
        #PY 2
        resp_headers = orig_resp.msg.headers

        for line in resp_headers:
            n, v = line.split(':', 1)
            n = n.lower()
            v = v.strip()

            if n in self.SKIP_HEADERS:
                continue

            new_v = v
            if n in self.UNREWRITE_HEADERS:
                new_v = self.unrewrite_header(cdx, v)

            # the header line is only rebuilt when its value was changed,
            # otherwise the original line is kept verbatim
            if new_v != v:
                http_headers_buff += n + ': ' + new_v + '\r\n'
            else:
                http_headers_buff += line

        # if python2, already byte headers, so leave as is
        http_headers_buff += '\r\n'

    # best effort: read the peer address from the underlying socket,
    # any failure just leaves the ip unset
    try:
        fp = upstream_res._fp.fp
        if hasattr(fp, 'raw'):  #pragma: no cover
            fp = fp.raw
        remote_ip = fp._sock.getpeername()[0]
    except:  #pragma: no cover
        remote_ip = None

    warc_headers = {}

    warc_headers['WARC-Type'] = 'response'
    warc_headers['WARC-Record-ID'] = self._make_warc_id()
    warc_headers['WARC-Target-URI'] = cdx['url']
    warc_headers['WARC-Date'] = datetime_to_iso_date(dt)

    # non-live loads additionally record where and when they were loaded
    if not cdx.get('is_live'):
        now = datetime.datetime.utcnow()
        warc_headers['WARC-Source-URI'] = cdx.get('load_url')
        warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now)

    if remote_ip:
        warc_headers['WARC-IP-Address'] = remote_ip

    ct = upstream_res.headers.get('Content-Type')
    if ct:
        metadata = self.get_custom_metadata(ct, dt)
        if metadata:
            warc_headers['WARC-JSON-Metadata'] = json.dumps(metadata)

    warc_headers['Content-Type'] = 'application/http; msgtype=response'

    # HEAD requests are given a content length of 0; otherwise the
    # upstream Content-Length is used, with -1 when the header is absent
    if method == 'HEAD':
        content_len = 0
    else:
        content_len = upstream_res.headers.get('Content-Length', -1)

    self._set_content_len(content_len,
                          warc_headers,
                          len(http_headers_buff))

    warc_headers = StatusAndHeaders('WARC/1.0',
                                    warc_headers.items())
    return (warc_headers, http_headers_buff, upstream_res)