def load_resource(self, cdx, params):
    """Build a synthetic WARC metadata record carrying extracted video info
    (JSON) for the capture's media URL.

    Returns (warc_headers, None, payload_stream), or None when this loader
    does not apply (no load_url, wrong content type, or no extractor).
    """
    load_url = cdx.get('load_url')
    if not load_url:
        return None

    if params.get('content_type') != self.CONTENT_TYPE:
        return None

    if not self.ydl:
        return None

    video_info = self.ydl.extract_info(load_url)
    payload = json.dumps(video_info).encode('utf-8')

    # swap the scheme for metadata:// to mark this as a derived record
    _, rest = load_url.split('://', 1)
    target_url = 'metadata://' + rest

    record_dt = timestamp_to_datetime(cdx['timestamp'])

    warc_fields = {
        'WARC-Type': 'metadata',
        'WARC-Record-ID': self._make_warc_id(),
        'WARC-Target-URI': target_url,
        'WARC-Date': datetime_to_iso_date(record_dt),
        'Content-Type': self.CONTENT_TYPE,
        'Content-Length': str(len(payload)),
    }

    warc_headers = StatusAndHeaders('WARC/1.0', warc_fields.items())
    return warc_headers, None, BytesIO(payload)
def __init__(self, statusline, headers, removed_header_dict, text_type, charset):
    """Hold rewrite results: headers removed during rewriting, the detected
    text type and charset, and the final status line plus headers."""
    self.removed_header_dict = removed_header_dict
    self.text_type = text_type
    self.charset = charset
    self.status_headers = StatusAndHeaders(statusline, headers)
def _redirect_if_needed(self, wbrequest, cdx):
    """Issue an internal 302 to the capture's exact timestamp when the
    request is a timegate or not already at it; None when no redirect applies."""
    if wbrequest.options['is_proxy']:
        return None

    needs_redir = wbrequest.options.get('is_timegate', False)
    if not needs_redir and self.redir_to_exact:
        needs_redir = cdx['timestamp'] != wbrequest.wb_url.timestamp

    if not needs_redir:
        return None

    new_url = wbrequest.urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                                url=cdx['original'])

    # FF shows a confirm dialog, so can't use 307 effectively
    # statusline = '307 Same-Method Internal Redirect'
    if wbrequest.method == 'POST':
        return None

    status_headers = StatusAndHeaders('302 Internal Redirect',
                                      [('Location', new_url)])

    # don't include cdx to indicate internal redirect
    return self.response_class(status_headers, wbrequest=wbrequest)
def bin_stream(stream, content_type, status='200 OK', headers=None):
    """Wrap a binary stream in a WbResponse with the given Content-Type and
    status, appending any extra headers after Content-Type."""
    all_headers = [('Content-Type', content_type)]
    if headers:
        all_headers = all_headers + headers
    return WbResponse(StatusAndHeaders(status, all_headers), value=stream)
def text_response(text, status='200 OK', content_type='text/plain; charset=utf-8'):
    """Build a WbResponse carrying *text* encoded as UTF-8.

    Sets Content-Type to *content_type* and Content-Length to the encoded
    byte length.  (Previously used len(text), which undercounts for
    non-ASCII text and would truncate the body at HTTP clients; identical
    for pure-ASCII payloads.)
    """
    encoded = text.encode('utf-8')
    status_headers = StatusAndHeaders(status,
                                      [('Content-Type', content_type),
                                       ('Content-Length', str(len(encoded)))])
    return WbResponse(status_headers, value=[encoded])
def get_top_frame_response(self, wbrequest):
    """Render the top-level replay frame template and return it as an
    HTML response."""
    params = self.get_top_frame_params(wbrequest, mod=self.replay_mod)
    rendered = self.frame_insert_view.render_to_string(**params)
    status_headers = StatusAndHeaders('200 OK', [('Content-Type', 'text/html')])
    return self.response_class(status_headers,
                               [rendered.encode('utf-8')],
                               wbrequest=wbrequest)
def select_coll_response(self, env, default_coll=None):
    """Challenge the proxy client with a 407 so it prompts for collection
    selection via Basic proxy auth."""
    realm = 'Basic realm="{0}"'.format(self.auth_msg)
    status_headers = StatusAndHeaders('407 Proxy Authentication',
                                      [('Content-Type', 'text/plain'),
                                       ('Proxy-Authenticate', realm)])
    return WbResponse(status_headers, value=[self.auth_msg.encode('utf-8')])
def _test_proxy_headers(http_cache=None):
    """Rewrite canned cache headers under the given http_cache option and
    return the rewritten status/headers."""
    rewriter = UrlRewriter('20131010/http://example.com/', '/pywb/',
                           rewrite_opts={'http_cache': http_cache})
    result = headerrewriter.rewrite(
        StatusAndHeaders('200 OK', _make_cache_headers()),
        rewriter,
        rewriter.get_cookie_rewriter())
    return result.status_headers
def __call__(self, cdx, skip_hosts, cdx_loader, wbrequest):
    """Fetch the capture from one of several candidate archive URLs and
    record per-client replay state in redis.

    Returns (status_headers, raw_stream).  Raises CaptureException when no
    candidate URL could be loaded.
    """
    self.session.cookies.clear()

    try_urls, host, archive_name = self._get_urls_to_try(cdx, skip_hosts, wbrequest)

    try:
        response = self._do_req(try_urls, host, cdx, wbrequest.env, skip_hosts)
    except Exception as e:
        print(e)
        response = None

    if response is None:
        print(skip_hosts)
        raise CaptureException('Content Could Not Be Loaded')

    # un-rewrite redirect targets so the client sees original archive urls
    if response.status_code >= 300 and response.status_code < 400:
        self.unrewrite_header(response, 'Location')
        self.unrewrite_header(response, 'Content-Location')

    remote = wbrequest.env.get('REMOTE_ADDR')
    req_ts = wbrequest.wb_url.timestamp
    base_key = remote + ':' + req_ts
    sec = timestamp_to_sec(cdx['timestamp'])
    referrer = wbrequest.env.get('HTTP_REFERER')

    # best-effort bookkeeping: replay must proceed even if redis fails
    try:
        pi = redisclient.redis.pipeline(transaction=False)
        pi.hset(base_key + ':urls', cdx['url'], sec)
        pi.sadd(base_key + ':hosts', archive_name)
        if referrer and not referrer.endswith('.css'):
            pi.set(base_key + ':ref', referrer)
        elif not referrer:
            pi.set(base_key + ':base', cdx['url'])
        pi.execute()
    except Exception:
        import traceback
        # BUG FIX: was traceback.print_exc(e) -- print_exc() takes a
        # limit/file, not an exception, so passing one raises TypeError
        # on py3; the current exception is taken from sys.exc_info()
        traceback.print_exc()

    statusline = str(response.status_code) + ' ' + response.reason
    status_headers = StatusAndHeaders(statusline, response.headers.items())
    return (status_headers, response.raw)
def fetch_local_file(self, uri):
    """Open a local file and synthesize a 200 response for it, guessing
    Content-Type from the file extension (may be None if unguessable)."""
    stream = LocalFileLoader().load(uri)

    guessed_type, _ = mimetypes.guess_type(uri)

    # create fake headers for local file
    status_headers = StatusAndHeaders('200 OK',
                                      [('Content-Type', guessed_type)])

    return (status_headers, stream)
def handle_range(self, wbrequest, key, wbresponse_func, url, start, end, use_206):
    """Serve byte range [start, end] of the response for *url*, caching the
    full body in a temp file keyed by *key* so later range requests reuse it.

    Returns (status_headers, body_iterable); non-200 responses are passed
    through uncached.
    """
    # key must be set
    assert (key)
    if key not in self.cache:
        wbrequest.custom_params['noredir'] = True
        response = wbresponse_func()

        # only cache 200 responses
        if not response.status_headers.get_statuscode().startswith('200'):
            return response.status_headers, response.body

        if not self.temp_dir:
            self.temp_dir = mkdtemp(prefix='_pywbcache')
        else:
            pass
            #self._check_dir_size(self.temp_dir)

        with NamedTemporaryFile(delete=False, dir=self.temp_dir) as fh:
            for obj in response.body:
                fh.write(obj)

            name = fh.name

        # store headers as lists (not tuples) so the dump is plain YAML
        # that safe_load can read back (tuples dump as !!python/tuple)
        spec = dict(name=name,
                    headers=[list(h) for h in response.status_headers.headers])

        self.cache[key] = yaml.dump(spec)
    else:
        # SECURITY FIX: safe_load instead of load -- yaml.load without an
        # explicit Loader can construct arbitrary python objects
        spec = yaml.safe_load(self.cache[key])

        spec['headers'] = [tuple(x) for x in spec['headers']]

    filelen = os.path.getsize(spec['name'])

    maxlen = filelen - start

    if end:
        maxlen = min(maxlen, end - start + 1)

    def read_range():
        # stream the cached file from 'start', bounded to 'maxlen' bytes
        with open(spec['name'], 'rb') as fh:
            fh.seek(start)
            fh = LimitReader.wrap_stream(fh, maxlen)
            while True:
                buf = fh.read()
                if not buf:
                    break
                yield buf

    status_headers = StatusAndHeaders('200 OK', spec['headers'])

    if use_206:
        StatusAndHeaders.add_range(status_headers, start, maxlen, filelen)

    status_headers.replace_header('Content-Length', str(maxlen))

    return status_headers, read_range()
def test_resp_3():
    """redir_response yields a 302 with Location and zero Content-Length."""
    actual = vars(WbResponse.redir_response('http://example.com/otherfile'))
    expected_headers = StatusAndHeaders(
        protocol='',
        statusline='302 Redirect',
        headers=[('Location', 'http://example.com/otherfile'),
                 ('Content-Length', '0')])
    assert actual == {'body': [], 'status_headers': expected_headers}
def test_resp_1():
    """text_response encodes the body and sets type/length headers."""
    actual = vars(WbResponse.text_response('Test'))
    expected_headers = StatusAndHeaders(
        protocol='',
        statusline='200 OK',
        headers=[('Content-Type', 'text/plain; charset=utf-8'),
                 ('Content-Length', '4')])
    assert actual == {'body': [b'Test'], 'status_headers': expected_headers}
def fetch_http(self, url, urlkey=None, env=None, req_headers=None,
               follow_redirects=False, ignore_proxies=False, verify=True):
    """Fetch *url* live over HTTP, optionally replaying method, body and
    headers from a WSGI *env*; returns (status_headers, raw_stream)."""
    method = 'GET'
    data = None
    proxies = None if ignore_proxies else self.proxies

    if not req_headers:
        req_headers = {}

    if env is not None:
        method = env['REQUEST_METHOD'].upper()
        input_ = env['wsgi.input']
        req_headers.update(self.translate_headers(url, urlkey, env))

        if method in ('POST', 'PUT'):
            len_ = env.get('CONTENT_LENGTH')
            # bound the body read when a content length is declared
            data = LimitReader(input_, int(len_)) if len_ else input_

    response = requests.request(method=method,
                                url=url,
                                data=data,
                                headers=req_headers,
                                allow_redirects=follow_redirects,
                                proxies=proxies,
                                stream=True,
                                verify=verify)

    statusline = str(response.status_code) + ' ' + response.reason
    status_headers = StatusAndHeaders(statusline, response.headers.items())
    return (status_headers, response.raw)
def _redirect_if_needed(self, wbrequest, cdx):
    """Redirect the client to the capture's exact timestamp when needed.

    Returns a redirect response, or None when no redirect applies
    (proxy mode, exact-redirects disabled, 'noredir' set by the caller,
    already at the exact timestamp, or a cached range request).
    """
    if not self.redir_to_exact:
        return None

    if wbrequest.options['is_proxy']:
        return None

    # a caller (e.g. the range cache) explicitly suppressed redirects
    if wbrequest.custom_params.get('noredir'):
        return None

    is_timegate = (wbrequest.options.get('is_timegate', False))
    if not is_timegate:
        is_timegate = wbrequest.wb_url.is_latest_replay()

    redir_needed = is_timegate or (cdx['timestamp'] != wbrequest.wb_url.timestamp)

    if not redir_needed:
        return None

    # range requests are cached under the requested url; don't redirect them
    if self.enable_range_cache and wbrequest.extract_range():
        return None

    #if is_timegate:
    #    timestamp = timestamp_now()
    #else:
    timestamp = cdx['timestamp']

    new_url = (wbrequest.urlrewriter.get_new_url(timestamp=timestamp,
                                                 url=cdx['url']))

    if wbrequest.method == 'POST':
        # FF shows a confirm dialog, so can't use 307 effectively
        # was: statusline = '307 Same-Method Internal Redirect'
        return None
    elif is_timegate:
        statusline = '302 Found'
    else:
        # clear cdx line to indicate internal redirect
        statusline = '302 Internal Redirect'
        cdx = None

    status_headers = StatusAndHeaders(statusline,
                                      [('Location', new_url)])

    return self.response_class(status_headers,
                               wbrequest=wbrequest,
                               cdx=cdx,
                               memento_is_redir=True)
def test_resp_4():
    """add_range upgrades a text response to 206 with range headers."""
    actual = vars(WbResponse.text_response('Test').add_range(10, 4, 100))
    expected_headers = StatusAndHeaders(
        protocol='',
        statusline='206 Partial Content',
        headers=[('Content-Type', 'text/plain; charset=utf-8'),
                 ('Content-Length', '4'),
                 ('Content-Range', 'bytes 10-13/100'),
                 ('Accept-Ranges', 'bytes')])
    assert actual == {'body': [b'Test'], 'status_headers': expected_headers}
def test_resp_2():
    """bin_stream passes the body through unchanged and honors a custom status."""
    actual = vars(WbResponse.bin_stream([b'Test', b'Another'],
                                        content_type='text/plain; charset=utf-8',
                                        status='404'))
    expected_headers = StatusAndHeaders(
        protocol='',
        statusline='404',
        headers=[('Content-Type', 'text/plain; charset=utf-8')])
    assert actual == {'body': [b'Test', b'Another'],
                      'status_headers': expected_headers}
def parse(self, stream, headerline=None):
    """Parse an ARC record header line into a StatusAndHeaders object.

    Reads the header line from *stream* unless *headerline* is supplied.
    For the 'filedesc://' file header, the following version and header-spec
    lines are consumed (counted in total_len) but otherwise ignored.

    Raises EOFError at end of stream, and StatusAndHeadersParserException
    when the field count does not match the configured arc header names.
    """
    total_read = 0

    def readline():
        return to_native_str(stream.readline())

    # if headerline passed in, use that
    if headerline is None:
        headerline = readline()
    else:
        headerline = to_native_str(headerline)

    if len(headerline) == 0:
        raise EOFError()

    headerline = headerline.rstrip()

    expected_names = self.headernames

    # arc file header: skip the next two lines (version + header spec),
    # still counting their bytes toward total_len
    if headerline.startswith('filedesc://'):
        total_read += len(readline())
        total_read += len(readline())

    fields = headerline.split(' ')

    if len(fields) != len(expected_names):
        msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
        raise StatusAndHeadersParserException(
            msg.format(expected_names, fields), fields)

    headers = list(zip(expected_names, fields))

    return StatusAndHeaders(statusline='',
                            headers=headers,
                            protocol='ARC/1.0',
                            total_len=total_read)
def handle_range(self, wbrequest, key, wbresponse_func, url, start, end, use_206):
    """Serve byte range [start, end] of the full response entirely from
    memory; returns (status_headers, body_generator)."""
    # adapt handle_range so it reads ranges directly from the response bytestring
    # rather than creating cache files on disk:
    # we cache warcs in redis, and these files can be huge and don't seem to be reliably cleaned up
    # https://github.com/harvard-lil/perma/issues/2428
    # original: https://github.com/webrecorder/pywb/blob/0.32.0/pywb/webapp/rangecache.py#L27

    # begin Perma changes 1
    wbrequest.custom_params['noredir'] = True
    response = wbresponse_func()
    # NOTE(review): joining chunks with b"\n" assumes the body iterable was
    # split on newlines; byte offsets would shift otherwise -- TODO confirm
    joined = b"\n".join(response.body)
    filelen = len(joined)
    # end Perma changes 1

    maxlen = filelen - start

    if end:
        maxlen = min(maxlen, end - start + 1)

    def read_range():
        # stream from 'start', bounded to 'maxlen' bytes
        with io.BytesIO( joined ) as fh:  # Perma changes 2: replaced real file w/ BytesIO
            fh.seek(start)
            fh = LimitReader.wrap_stream(fh, maxlen)
            while True:
                buf = fh.read()
                if not buf:
                    break
                yield buf

    # begin Perma changes 3
    status_headers = StatusAndHeaders('200 OK', response.status_headers.headers)
    # end Perma changes 3

    if use_206:
        StatusAndHeaders.add_range(status_headers, start, maxlen, filelen)

    status_headers.replace_header('Content-Length', str(maxlen))

    return status_headers, read_range()
def _test_headers(headers, status='200 OK'):
    """Rewrite *headers* through the module-level urlrewriter and
    pretty-print the resulting object (pprint returns None)."""
    original = StatusAndHeaders(status, headers)
    result = headerrewriter.rewrite(original,
                                    urlrewriter,
                                    urlrewriter.get_cookie_rewriter())
    return pprint.pprint(vars(result))
""" >>> st1 = StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_1)) >>> st1 StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'), ('Some', 'Value'), ('Multi-Line', 'Value1 Also This')]) # add range >>> StatusAndHeaders(statusline = '200 OK', headers=[('Content-Type', 'text/plain')]).add_range(10, 4, 100) StatusAndHeaders(protocol = '', statusline = '206 Partial Content', headers = [ ('Content-Type', 'text/plain'), ('Content-Range', 'bytes 10-13/100'), ('Accept-Ranges', 'bytes')]) >>> StatusAndHeadersParser(['Other']).parse(BytesIO(status_headers_1)) Traceback (most recent call last): StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK # test equality op >>> st1 == StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_1)) True # replace header, print new headers >>> st1.replace_header('some', 'Another-Value'); st1 'Value' StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'), ('Some', 'Another-Value'), ('Multi-Line', 'Value1 Also This')]) # remove header
def parse_record_stream(self, stream, statusline=None,
                        known_format=None, no_record_parse=False):
    """Parse file-like stream and return an ArcWarcRecord
    encapsulating the record headers, http headers (if any),
    and a stream limited to the remainder of the record.

    Pass statusline and known_format to _detect_type_load_headers()
    to facilitate parsing.
    """
    (the_format, rec_headers) = (self._detect_type_load_headers(
        stream, statusline, known_format))

    # extract uri/length/content-type uniformly for both formats;
    # arc subtracts its own header bytes from the declared length
    if the_format == 'arc':
        uri = rec_headers.get_header('uri')
        length = rec_headers.get_header('length')
        content_type = rec_headers.get_header('content-type')
        sub_len = rec_headers.total_len
        if uri and uri.startswith('filedesc://'):
            rec_type = 'arc_header'
        else:
            rec_type = 'response'
    elif the_format == 'warc':
        rec_type = rec_headers.get_header('WARC-Type')
        uri = rec_headers.get_header('WARC-Target-URI')
        length = rec_headers.get_header('Content-Length')
        content_type = rec_headers.get_header('Content-Type')
        sub_len = 0

    is_err = False

    # a missing, non-numeric, or negative length marks the record as bad
    try:
        if length is not None:
            length = int(length) - sub_len
            if length < 0:
                is_err = True
    except (ValueError, TypeError):
        is_err = True

    # err condition
    if is_err:
        length = 0

    # limit stream to the length for all valid records
    if length is not None and length >= 0:
        stream = LimitReader.wrap_stream(stream, length)

    # don't parse the http record at all
    if no_record_parse:
        status_headers = None  #StatusAndHeaders('', [])

    # if empty record (error or otherwise) set status to 204
    elif length == 0:
        if is_err:
            msg = '204 Possible Error'
        else:
            msg = '204 No Content'

        status_headers = StatusAndHeaders(msg, [])

    # response record or non-empty revisit: parse HTTP status and headers!
    elif (rec_type in ('response', 'revisit')
          and uri.startswith(self.HTTP_SCHEMES)):
        status_headers = self.http_parser.parse(stream)

    # request record: parse request
    elif ((rec_type == 'request')
          and uri.startswith(self.HTTP_SCHEMES)):
        status_headers = self.http_req_parser.parse(stream)

    # everything else: create a no-status entry, set content-type
    else:
        content_type_header = [('Content-Type', content_type)]

        if length is not None and length >= 0:
            content_type_header.append(('Content-Length', str(length)))

        status_headers = StatusAndHeaders('200 OK', content_type_header)

    return ArcWarcRecord(the_format, rec_type, rec_headers,
                         stream, status_headers, content_type, length)
def handle_range(self, wbrequest, key, wbresponse_func, url, start, end, use_206):
    """Serve byte range [start, end] of the response for *url*, caching the
    full body in a temp file keyed by *key* so later range requests reuse it.

    Returns (status_headers, body_iterable); non-200 responses are passed
    through uncached.
    """
    # key must be set
    assert(key)
    if key not in self.cache:
        wbrequest.custom_params['noredir'] = True
        response = wbresponse_func()

        # only cache 200 responses
        if not response.status_headers.get_statuscode().startswith('200'):
            return response.status_headers, response.body

        if not self.temp_dir:
            self.temp_dir = mkdtemp(prefix='_pywbcache')
        else:
            pass
            #self._check_dir_size(self.temp_dir)

        with NamedTemporaryFile(delete=False, dir=self.temp_dir) as fh:
            for obj in response.body:
                fh.write(obj)

            name = fh.name

        # store headers as lists (not tuples) so the dump is plain YAML
        # that safe_load can read back (tuples dump as !!python/tuple)
        spec = dict(name=name,
                    headers=[list(h) for h in response.status_headers.headers])

        self.cache[key] = yaml.dump(spec)
    else:
        # SECURITY FIX: safe_load instead of load -- yaml.load without an
        # explicit Loader can construct arbitrary python objects
        spec = yaml.safe_load(self.cache[key])

        spec['headers'] = [tuple(x) for x in spec['headers']]

    filelen = os.path.getsize(spec['name'])

    maxlen = filelen - start

    if end:
        maxlen = min(maxlen, end - start + 1)

    def read_range():
        # stream the cached file from 'start', bounded to 'maxlen' bytes
        with open(spec['name'], 'rb') as fh:
            fh.seek(start)
            fh = LimitReader.wrap_stream(fh, maxlen)
            while True:
                buf = fh.read()
                if not buf:
                    break
                yield buf

    status_headers = StatusAndHeaders('200 OK', spec['headers'])

    if use_206:
        StatusAndHeaders.add_range(status_headers, start, maxlen, filelen)

    status_headers.replace_header('Content-Length', str(maxlen))

    return status_headers, read_range()
def _test_head_data(headers, status='200 OK', rewriter=urlrewriter):
    """Rewrite *headers* with the given rewriter and return the rewritten
    status/headers object."""
    original = StatusAndHeaders(status, headers)
    result = headerrewriter.rewrite(original,
                                    rewriter,
                                    rewriter.get_cookie_rewriter())
    return result.status_headers
def load_resource(self, cdx, params):
    """Fetch the capture live from upstream and wrap it as a WARC
    response record.

    Returns (warc_headers, http_headers_buff, upstream_res), or
    (None, headers, upstream_res) when upstream already returns warc data,
    or None when this loader does not apply.  Raises LiveResourceException
    when the upstream request fails.
    """
    load_url = cdx.get('load_url')
    if not load_url:
        return None

    # video content is handled by the VideoLoader, not here
    if params.get('content_type') == VideoLoader.CONTENT_TYPE:
        return None

    input_req = params['_input_req']

    req_headers = input_req.get_req_headers()

    dt = timestamp_to_datetime(cdx['timestamp'])

    # memento-aware upstream: ask for the capture nearest this datetime
    if cdx.get('memento_url'):
        req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

    method = input_req.get_req_method()
    data = input_req.get_req_body()

    # use requests' PreparedRequest to normalize the url and extract any
    # inline credentials into an Authorization header
    p = PreparedRequest()
    p.prepare_url(load_url, None)
    p.prepare_headers(None)
    p.prepare_auth(None, load_url)

    auth = p.headers.get('Authorization')
    if auth:
        req_headers['Authorization'] = auth

    load_url = p.url

    try:
        upstream_res = self.pool.urlopen(method=method,
                                         url=load_url,
                                         body=data,
                                         headers=req_headers,
                                         redirect=False,
                                         assert_same_host=False,
                                         preload_content=False,
                                         decode_content=False,
                                         retries=self.num_retries,
                                         timeout=params.get('_timeout'))
    except Exception as e:
        # NOTE(review): original cause 'e' is discarded here; consider
        # 'raise ... from e' to preserve the traceback chain
        raise LiveResourceException(load_url)

    memento_dt = upstream_res.headers.get('Memento-Datetime')
    if memento_dt:
        dt = http_date_to_datetime(memento_dt)
        cdx['timestamp'] = datetime_to_timestamp(dt)
    elif cdx.get('memento_url'):
        # if 'memento_url' set and no Memento-Datetime header present
        # then its an error
        return None

    agg_type = upstream_res.headers.get('WebAgg-Type')
    if agg_type == 'warc':
        # upstream already returned a warc record; pass it through as-is
        cdx['source'] = unquote(upstream_res.headers.get('WebAgg-Source-Coll'))
        return None, upstream_res.headers, upstream_res

    self.raise_on_self_redirect(params, cdx,
                                str(upstream_res.status),
                                upstream_res.headers.get('Location'))

    if upstream_res.version == 11:
        version = '1.1'
    else:
        version = '1.0'

    # rebuild the raw HTTP status line + headers for the warc payload
    status = 'HTTP/{version} {status} {reason}\r\n'
    status = status.format(version=version,
                           status=upstream_res.status,
                           reason=upstream_res.reason)

    http_headers_buff = status

    orig_resp = upstream_res._original_response

    try:  #pragma: no cover
        #PY 3
        resp_headers = orig_resp.headers._headers
        for n, v in resp_headers:
            if n.lower() in self.SKIP_HEADERS:
                continue
            http_headers_buff += n + ': ' + v + '\r\n'
    except:  #pragma: no cover
        #PY 2
        resp_headers = orig_resp.msg.headers
        for n, v in zip(orig_resp.getheaders(), resp_headers):
            if n in self.SKIP_HEADERS:
                continue
            http_headers_buff += v

    http_headers_buff += '\r\n'
    http_headers_buff = http_headers_buff.encode('latin-1')

    # dig into urllib3/httplib internals for the remote peer ip; any
    # failure simply omits WARC-IP-Address
    try:
        fp = upstream_res._fp.fp
        if hasattr(fp, 'raw'):  #pragma: no cover
            fp = fp.raw
        remote_ip = fp._sock.getpeername()[0]
    except:  #pragma: no cover
        remote_ip = None

    warc_headers = {}

    warc_headers['WARC-Type'] = 'response'
    warc_headers['WARC-Record-ID'] = self._make_warc_id()
    warc_headers['WARC-Target-URI'] = cdx['url']
    warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
    if remote_ip:
        warc_headers['WARC-IP-Address'] = remote_ip

    warc_headers['Content-Type'] = 'application/http; msgtype=response'

    self._set_content_len(upstream_res.headers.get('Content-Length', -1),
                          warc_headers,
                          len(http_headers_buff))

    warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
    return (warc_headers, http_headers_buff, upstream_res)
def redir_response(location, status='302 Redirect', headers=None):
    """Build a redirect WbResponse to *location* with an empty body; any
    extra *headers* are appended after Location/Content-Length."""
    base_headers = [('Location', location), ('Content-Length', '0')]
    if headers:
        base_headers = base_headers + headers
    return WbResponse(StatusAndHeaders(status, base_headers))
def fetch_http(self, url, urlkey=None, env=None, req_headers=None,
               follow_redirects=False, skip_recording=False, verify=True):
    """Fetch *url* live via self.live_request, optionally replaying the
    method, body and headers from a WSGI *env*.

    Returns (status_headers, raw_stream).  Unless skip_recording is set,
    the request goes through self.proxies (presumably a recording proxy --
    verify against caller).
    """
    method = 'GET'
    data = None

    proxies = None
    if not skip_recording:
        proxies = self.proxies

    if not req_headers:
        req_headers = {}

    if env is not None:
        method = env['REQUEST_METHOD'].upper()
        input_ = env['wsgi.input']
        req_headers.update(self.translate_headers(url, urlkey, env))

        if method in ('POST', 'PUT'):
            len_ = env.get('CONTENT_LENGTH')
            if len_:
                # bound the body read when a content length is declared
                data = LimitReader(input_, int(len_))
            else:
                data = input_

    response = self.live_request(method=method,
                                 url=url,
                                 data=data,
                                 headers=req_headers,
                                 allow_redirects=follow_redirects,
                                 proxies=proxies,
                                 stream=True,
                                 verify=verify)

    statusline = str(response.status_code) + ' ' + response.reason

    headers = response.headers.items()

    stream = response.raw

    # prefer the raw, un-normalized headers from the underlying
    # http(lib) response over requests' case-folded view
    try:  #pragma: no cover
        #PY 3
        headers = stream._original_response.headers._headers
    except:  #pragma: no cover
        #PY 2
        headers = []
        resp_headers = stream._original_response.msg.headers
        for h in resp_headers:
            n, v = h.split(':', 1)
            n = n.strip()
            v = v.strip()
            headers.append((n, v))

    status_headers = StatusAndHeaders(statusline, headers)

    return (status_headers, stream)