def load(self, url, offset=0, length=-1):
    """Open a file-like reader for *url* from the local file system.

    Falls back to the parent loader unless the url is unambiguously a
    local path. Seeks to *offset*; when *length* is non-negative the
    reader is capped via LimitReader.
    """
    # A leading '/' or '.' can only ever be a filesystem path.
    file_only = url.startswith(('/', '.'))

    # A file:// url is converted; if conversion changed it, it is a path.
    filename = from_file_url(url)
    if filename != url:
        url = filename
        file_only = True

    fh = None
    try:
        # Attempt to open directly as a local file first.
        fh = open(url, 'rb')
    except IOError:
        no_except_close(fh)
        if file_only:
            raise
        # Not a plain file path: defer to the parent implementation.
        return super(LocalFileLoader, self).load(url, offset, length)

    if offset > 0:
        fh.seek(offset)

    return LimitReader(fh, length) if length >= 0 else fh
def _do_request(self, method, load_url, data, req_headers, params, is_live):
    """Issue the upstream request via urllib3 and return the raw response.

    Uses the live adapter for live requests and the remote adapter
    otherwise; routes through the SOCKS proxy pool when one is
    configured. Raises LiveResourceException on any failure.
    """
    if is_live:
        adapter = DefaultAdapters.live_adapter
    else:
        adapter = DefaultAdapters.remote_adapter

    retries = adapter.max_retries

    # Pick the connection source: SOCKS-proxied pool or default manager.
    if SOCKS_PROXIES:
        conn = adapter.get_connection(load_url, SOCKS_PROXIES)
    else:
        conn = adapter.poolmanager

    upstream_res = None
    try:
        upstream_res = conn.urlopen(method=method,
                                    url=load_url,
                                    body=data,
                                    headers=req_headers,
                                    redirect=False,
                                    assert_same_host=False,
                                    preload_content=False,
                                    decode_content=False,
                                    retries=retries,
                                    timeout=params.get('_timeout'))
        return upstream_res

    except Exception as e:
        # Release any partially-created response before reporting failure.
        if upstream_res:
            no_except_close(upstream_res)
        if logger.isEnabledFor(logging.DEBUG):
            import traceback
            traceback.print_exc()
        logger.debug('FAILED: ' + method + ' ' + load_url + ': ' + str(e))
        raise LiveResourceException(load_url)
def handle_timegate(self, params, timestamp):
    """Query the remote timegate for url@timestamp and synthesize a
    single CDX entry from the memento info in the response headers.
    """
    url = params['url']
    load_url = self.timegate_url.format(url=url, timestamp=timestamp)
    res = None
    try:
        headers = self._get_headers(params)
        # HEAD request: only the Memento response headers are needed
        res = self.sesh.head(load_url, headers=headers)
    except Exception as e:
        no_except_close(res)
        raise NotFoundException(url)

    # only use the response if it actually carries memento info
    if res and res.headers.get('Memento-Datetime'):
        if res.status_code >= 400:
            no_except_close(res)
            raise NotFoundException(url)

        # 3xx: memento url is in Location; 2xx: in Content-Location
        if res.status_code >= 300:
            info = self._extract_location(url, res.headers.get('Location'))
        else:
            info = self._extract_location(
                url, res.headers.get('Content-Location'))

        url, timestamp, load_url = info

    # build the synthetic cdx entry pointing at the resolved memento
    cdx = CDXObject()
    cdx['urlkey'] = canonicalize(url)
    cdx['timestamp'] = timestamp
    cdx['url'] = url
    cdx['load_url'] = load_url

    # propagate referrer so the downstream loader can set it
    if 'Referer' in headers:
        cdx['set_referrer'] = headers['Referer']

    return iter([cdx])
def load_yaml_config(config_file):
    """Open *config_file* and parse it as YAML.

    The underlying reader is always closed, even when parsing fails.
    """
    parsed = None
    stream = None
    try:
        stream = load(config_file)
        # NOTE(review): yaml.Loader permits arbitrary object construction;
        # only safe for trusted config files -- confirm inputs are local.
        parsed = yaml.load(stream, Loader=yaml.Loader)
    finally:
        no_except_close(stream)

    return parsed
def _close_file(self, fh):
    """Unlock (non-Windows) and close the file handle *fh*.

    Returns True when unlocking succeeded (or was skipped on Windows),
    False if it raised; the handle is closed in every case.
    """
    unlocked = True
    try:
        # portalocker unlock is skipped on Windows
        if os.name != 'nt':
            portalocker.lock(fh, portalocker.LOCK_UN)
    except Exception as e:
        print(e)
        unlocked = False
    finally:
        no_except_close(fh)

    return unlocked
def rewrite_text_stream_to_gen(self, stream, rwinfo):
    """
    Convert stream to generator using applying rewriting func
    to each portion of the stream.
    Align to line boundaries if needed.

    Decodes each chunk, passes it through self.rewrite(), and re-encodes
    to bytes; falls back from utf-8 to iso-8859-1 on decode errors.
    The input stream is always closed when the generator finishes.
    """
    try:
        buff = self.first_buff

        # for html rewriting:
        # if charset is utf-8, use that, otherwise default to encode to ascii-compatible encoding
        # encoding only used for url rewriting, encoding back to bytes after rewriting
        if rwinfo.charset == 'utf-8' and rwinfo.text_type == 'html':
            charset = 'utf-8'
        else:
            charset = 'iso-8859-1'

        if buff:
            yield buff.encode(charset)

        # incremental decoder carries partial multi-byte sequences
        # across chunk boundaries
        decoder = codecs.getincrementaldecoder(charset)()

        while True:
            buff = stream.read(BUFF_SIZE)
            if not buff:
                break

            # extend chunk to the end of the current line if required
            if self.align_to_line:
                buff += stream.readline()

            try:
                buff = decoder.decode(buff)
            except UnicodeDecodeError:
                # utf-8 declared but content isn't: downgrade to latin-1,
                # which can decode any byte sequence
                if charset == 'utf-8':
                    rwinfo.charset = 'iso-8859-1'
                    charset = rwinfo.charset
                    decoder = codecs.getincrementaldecoder(charset)()
                    buff = decoder.decode(buff)

            buff = self.rewrite(buff)

            yield buff.encode(charset)

        # For adding a tail/handling final buffer
        buff = self.final_read()

        # ensure decoder is marked as finished (final buffer already decoded)
        decoder.decode(b'', final=True)

        if buff:
            yield buff.encode(charset)

    finally:
        no_except_close(stream)
def __call__(self, env, start_response):
    """Callable definition to allow WbResponse control over how the response is sent

    :param dict env: The WSGI environment dictionary
    :param function start_response: The WSGI start_response function
    :return: The response body
    """
    start_response(self.status_headers.statusline,
                   self.status_headers.headers)

    method = env['REQUEST_METHOD']

    # HEAD/OPTIONS and 304 responses carry no body; close the unused one
    body_suppressed = (method in ('HEAD', 'OPTIONS') or
                       self.status_headers.statusline.startswith('304'))

    if body_suppressed:
        no_except_close(self.body)
        return []

    return self.body
def _put_record(self, request_uri, input_buff, record_type, headers,
                params, start_response):
    """Write the request body as a WARC record of *record_type*.

    'stream' records are written directly to file; other types are
    buffered through a ReqWrapper and written via create_warc_record.
    Returns a JSON status message response.
    """
    if record_type == 'stream':
        if self.writer.write_stream_to_file(params, input_buff):
            msg = {'success': 'true'}
        else:
            msg = {'error_message': 'upload_error'}

        return self.send_message(msg, '200 OK', start_response)

    req_stream = None
    try:
        req_stream = ReqWrapper(input_buff, headers, params,
                                self.create_buff_func)

        # drain the input so the wrapper's side buffer (req_stream.out)
        # is fully populated
        while True:
            buff = req_stream.read()
            if not buff:
                break

        content_type = headers.get('Content-Type')

        # rewind the buffered payload before handing it to the writer
        payload_length = req_stream.out.tell()
        req_stream.out.seek(0)

        record = self.writer.create_warc_record(
            uri=params['url'],
            record_type=record_type,
            payload=req_stream.out,
            length=payload_length,
            warc_content_type=content_type,
            warc_headers_dict=req_stream.headers)

        self.writer.write_record(record, params)

        msg = {
            'success': 'true',
            'WARC-Date': record.rec_headers.get_header('WARC-Date')
        }

    finally:
        if req_stream:
            no_except_close(req_stream.out)

    return self.send_message(msg, '200 OK', start_response)
def handle_timemap(self, params):
    """Fetch the remote timemap for the templated url and convert the
    link-format body into cdx objects.

    Raises NotFoundException when the fetch fails or returns an error.
    """
    url = res_template(self.timemap_url, params)
    headers = self._get_headers(params)

    res = None
    try:
        res = self.sesh.get(url,
                            headers=headers,
                            timeout=params.get('_timeout'))
        res.raise_for_status()
    except Exception as e:
        no_except_close(res)
        self.logger.debug('FAILED: ' + str(e))
        raise NotFoundException(url)

    return self.links_to_cdxobject(res.text, 'timemap')
def __call__(self, cdx, failed_files, cdx_loader, *args, **kwargs): headers_record, payload_record = self.load_headers_and_payload( cdx, failed_files, cdx_loader) # Default handling logic when loading http status/headers # special case: set header to payload if old-style revisit # with missing header if not headers_record: headers_record = payload_record elif headers_record != payload_record: # close remainder of stream as this record only used for # (already parsed) headers no_except_close(headers_record.raw_stream) # special case: check if headers record is actually empty # (eg empty revisit), then use headers from revisit if not headers_record.http_headers.headers: headers_record = payload_record if not headers_record or not payload_record: if headers_record: no_except_close(headers_record.raw_stream) if payload_record: no_except_close(payload_record.raw_stream) raise ArchiveLoadFailed('Could not load ' + str(cdx)) # ensure status line is valid from here headers_record.http_headers.validate_statusline('204 No Content') return (headers_record.http_headers, payload_record.raw_stream)
def _write_one(self):
    """Take one queued request/response pair off the write queue and
    write it out as WARC record(s); payload buffers are always closed.
    """
    request_payload = None
    response_payload = None
    try:
        (req_head, request_payload,
         resp_head, response_payload, params) = self.write_queue.get()

        resp_length = response_payload.tell()
        response_payload.seek(0)

        resp = ArcWarcRecordLoader().parse_record_stream(response_payload)

        if resp.rec_type != 'response':
            # non-response record: write it out standalone
            self.writer.write_record(resp, params)
        else:
            # build the matching request record, then write the pair
            uri = resp.rec_headers.get_header('WARC-Target-Uri')

            req_length = request_payload.tell()
            request_payload.seek(0)

            req = self.writer.create_warc_record(
                uri=uri,
                record_type='request',
                payload=request_payload,
                length=req_length,
                warc_headers_dict=req_head)

            self.writer.write_request_response_pair(req, resp, params)

    finally:
        try:
            if request_payload:
                no_except_close(request_payload)
            if response_payload:
                no_except_close(response_payload)
        except Exception as e:
            traceback.print_exc()
def _do_request(self, method, load_url, data, req_headers, params, is_live):
    """Perform the upstream request through urllib3.

    Selects the live or remote adapter and, unless disabled via the
    SOCKS_DISABLE env var, routes through the configured SOCKS proxy.
    Raises LiveResourceException on any failure.
    """
    adapter = (DefaultAdapters.live_adapter if is_live
               else DefaultAdapters.remote_adapter)
    retries = adapter.max_retries

    # get either the poolmanager or proxy manager to handle this connection
    if self.socks_proxy and not os.environ.get('SOCKS_DISABLE'):
        manager = adapter.proxy_manager_for(self.socks_proxy)
    else:
        manager = adapter.poolmanager

    upstream_res = None
    try:
        upstream_res = manager.urlopen(method=method,
                                       url=load_url,
                                       body=data,
                                       headers=req_headers,
                                       redirect=False,
                                       assert_same_host=False,
                                       preload_content=False,
                                       decode_content=False,
                                       retries=retries,
                                       timeout=params.get('_timeout'))
        return upstream_res

    except Exception as e:
        # release any partially-created response before reporting failure
        if upstream_res:
            no_except_close(upstream_res)
        if logger.isEnabledFor(logging.DEBUG):
            import traceback
            traceback.print_exc()
        logger.debug('FAILED: ' + method + ' ' + load_url + ': ' + str(e))
        raise LiveResourceException(load_url)
def _write_to_file(self):
    """Queue this wrapper's buffered request/response for writing.

    If the transfer was interrupted, or queueing fails, the buffers are
    closed instead; the request wrapper is released in every case.
    """
    skip = False
    try:
        if self.interrupted:
            skip = True

        if not skip:
            self.queue.put((self.req.headers, self.req.out,
                            self.headers, self.out, self.params))
    except Exception:
        traceback.print_exc()
        skip = True
    finally:
        if skip:
            # not queued, so the buffers will never be consumed: close now
            no_except_close(self.out)
            no_except_close(self.req.out)

        no_except_close(self.req)
        self.req = None
def render_content(self, wb_url, kwargs, environ):
    """Replay a single archived (or live) resource for *wb_url*.

    Fetches the resource from the upstream warcserver, parses the
    returned WARC record, applies content rewriting plus memento /
    proxy headers, and returns the final WbResponse.
    """
    # '#' would be dropped as a fragment when parsed; escape it first
    wb_url = wb_url.replace('#', '%23')
    wb_url = WbUrl(wb_url)

    # honor forwarded protocol (or forced scheme) for url construction
    proto = environ.get('HTTP_X_FORWARDED_PROTO', self.force_scheme)

    if proto:
        environ['wsgi.url_scheme'] = proto

    # 'history page' mode: replay the given page url as an ajax request
    history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
    if history_page:
        wb_url.url = history_page
        is_ajax = True
    else:
        is_ajax = self.is_ajax(environ)

    is_timegate = self._check_accept_dt(wb_url, environ)

    host_prefix = self.get_host_prefix(environ)
    rel_prefix = self.get_rel_prefix(environ)
    full_prefix = host_prefix + rel_prefix
    environ['pywb.host_prefix'] = host_prefix
    pywb_static_prefix = host_prefix + environ.get(
        'pywb.app_prefix', '') + environ.get('pywb.static_prefix',
                                             '/static/')
    is_proxy = ('wsgiprox.proxy_host' in environ)

    # a custom response (eg. top-frame) short-circuits normal replay
    response = self.handle_custom_response(environ, wb_url, full_prefix,
                                           host_prefix, kwargs)
    if response:
        return self.format_response(response, wb_url, full_prefix,
                                    is_timegate, is_proxy)

    # proxy mode uses an identity rewriter and no framing
    if is_proxy:
        environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
        urlrewriter = IdentityUrlRewriter(wb_url, '')
        framed_replay = False
    else:
        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix,
                                  pywb_static_prefix=pywb_static_prefix)
        framed_replay = self.framed_replay

    url_parts = urlsplit(wb_url.url)
    if not url_parts.path:
        return self.send_redirect('/', url_parts, urlrewriter)

    self.unrewrite_referrer(environ, full_prefix)

    urlkey = canonicalize(wb_url.url)

    if self.use_js_obj_proxy:
        content_rw = self.js_proxy_rw
    else:
        content_rw = self.default_rw

    inputreq = RewriteInputRequest(environ, urlkey, wb_url.url,
                                   content_rw)
    inputreq.include_method_query(wb_url.url)

    range_start, range_end, skip_record = self._check_range(
        inputreq, wb_url)

    # pass along tracked cookies for this collection, if any
    setcookie_headers = None
    cookie_key = None
    if self.cookie_tracker:
        cookie_key = self.get_cookie_key(kwargs)
        if cookie_key:
            res = self.cookie_tracker.get_cookie_headers(
                wb_url.url, urlrewriter, cookie_key,
                environ.get('HTTP_COOKIE', ''))
            inputreq.extra_cookie, setcookie_headers = res

    r = self._do_req(inputreq, wb_url, kwargs, skip_record)

    # upstream error: read error body (best-effort) and re-raise
    if r.status_code >= 400:
        error = None
        try:
            error = r.raw.read()
        except Exception:
            pass
        finally:
            no_except_close(r.raw)

        if error:
            error = error.decode('utf-8')
        else:
            error = ''

        details = dict(args=kwargs, error=error)
        raise UpstreamException(r.status_code, url=wb_url.url,
                                details=details)

    cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

    cdx_url_parts = urlsplit(cdx['url'])

    if cdx_url_parts.path.endswith(
            '/') and not url_parts.path.endswith('/'):
        # add trailing slash
        new_path = url_parts.path + '/'

        no_except_close(r.raw)
        return self.send_redirect(new_path, url_parts, urlrewriter)

    stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
    record = self.loader.parse_record_stream(stream,
                                             ensure_http_headers=True)

    memento_dt = r.headers.get('Memento-Datetime')
    target_uri = r.headers.get('WARC-Target-URI')

    # cdx['urlkey'] = urlkey
    # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
    # cdx['url'] = target_uri

    set_content_loc = False

    # Check if Fuzzy Match
    if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
        set_content_loc = True

    # if redir to exact, redir if url or ts are different
    if self.redirect_to_exact:
        if (set_content_loc or
                (wb_url.timestamp != cdx.get('timestamp') and
                 not cdx.get('is_live'))):
            new_url = urlrewriter.get_new_url(url=target_uri,
                                              timestamp=cdx['timestamp'],
                                              mod=wb_url.mod)

            resp = WbResponse.redir_response(new_url,
                                             '307 Temporary Redirect')
            if self.enable_memento:
                if is_timegate and not is_proxy:
                    self._add_memento_links(target_uri, full_prefix,
                                            memento_dt, cdx['timestamp'],
                                            resp.status_headers,
                                            is_timegate, is_proxy)
                else:
                    resp.status_headers['Link'] = MementoUtils.make_link(
                        target_uri, 'original')

            return resp

    self._add_custom_params(cdx, r.headers, kwargs, record)

    # range request: switch to identity mod so body isn't rewritten
    if self._add_range(record, wb_url, range_start, range_end):
        wb_url.mod = 'id_'

    if is_ajax:
        head_insert_func = None
        urlrewriter.rewrite_opts['is_ajax'] = True
    else:
        top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
        head_insert_func = (self.head_insert_view.create_insert_func(
            wb_url, full_prefix, host_prefix, top_url, environ,
            framed_replay, coll=kwargs.get('coll', ''),
            replay_mod=self.replay_mod, config=self.config))

    cookie_rewriter = None
    if self.cookie_tracker and cookie_key:
        # skip add cookie if service worker is not 200
        # it seems cookie headers from service workers are not applied, so don't update in cache
        if wb_url.mod == 'sw_':
            cookie_key = None
        cookie_rewriter = self.cookie_tracker.get_rewriter(
            urlrewriter, cookie_key)

    urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

    result = content_rw(record, urlrewriter, cookie_rewriter,
                        head_insert_func, cdx, environ)

    status_headers, gen, is_rw = result

    # history page mode: only the title is needed, not the body
    if history_page:
        title = DefaultRewriter._extract_title(gen)
        if not title:
            title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))
        if not title:
            title = history_page
        self._add_history_page(cdx, kwargs, title)
        return WbResponse.json_response({'title': title})

    if setcookie_headers:
        status_headers.headers.extend(setcookie_headers)

    # ensure the statusline has a reason phrase
    if ' ' not in status_headers.statusline:
        status_headers.statusline += ' None'

    if not is_ajax and self.enable_memento:
        self._add_memento_links(cdx['url'], full_prefix, memento_dt,
                                cdx['timestamp'], status_headers,
                                is_timegate, is_proxy,
                                cdx.get('source-coll'))

        set_content_loc = True

    if set_content_loc and not self.redirect_to_exact:
        status_headers.headers.append(
            ('Content-Location',
             urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                     url=cdx['url'])))

    if not is_proxy:
        self.add_csp_header(wb_url, status_headers)

    response = WbResponse(status_headers, gen)

    return response
def load_resource(self, cdx, params):
    """Load the WARC headers and payload for *cdx* from a local WARC.

    Returns (warc_headers, http_headers_buff, payload_stream), the
    cached result, or None if this loader can not handle the entry.
    """
    if cdx.get('_cached_result'):
        return cdx.get('_cached_result')

    # requires a concrete WARC filename and offset to load from
    if not cdx.get('filename') or cdx.get('offset') is None:
        return None

    orig_source = cdx.get('source', '').split(':')[0]
    formatter = ParamFormatter(params, orig_source)
    cdx._formatter = formatter

    def local_index_query(local_params):
        # re-run the index query, forwarding any 'param.'-prefixed params
        for n, v in six.iteritems(params):
            if n.startswith('param.'):
                local_params[n] = v

        cdx_iter, errs = self.cdx_source(local_params)
        for cdx in cdx_iter:
            cdx._formatter = formatter
            yield cdx

    failed_files = []
    headers, payload = (self.resolve_loader.load_headers_and_payload(
        cdx, failed_files, local_index_query))

    http_headers_buff = None
    if payload.rec_type in ('response', 'revisit'):
        status = cdx.get('status')

        # status may not be set for 'revisit'
        if not status or status.startswith('3'):
            http_headers = self.headers_parser.parse(payload.raw_stream)

            try:
                self.raise_on_self_redirect(
                    params, cdx, http_headers.get_statuscode(),
                    http_headers.get_header('Location'))
            except LiveResourceException:
                # close both streams before propagating the failure
                no_except_close(headers.raw_stream)
                no_except_close(payload.raw_stream)
                raise

            http_headers_buff = http_headers.to_bytes()

    warc_headers = payload.rec_headers

    # distinct headers record means this is a revisit: merge uri/date
    # info from both records into the outgoing WARC headers
    if headers != payload:
        warc_headers.replace_header(
            'WARC-Refers-To-Target-URI',
            payload.rec_headers.get_header('WARC-Target-URI'))
        warc_headers.replace_header(
            'WARC-Refers-To-Date',
            payload.rec_headers.get_header('WARC-Date'))
        warc_headers.replace_header(
            'WARC-Target-URI',
            headers.rec_headers.get_header('WARC-Target-URI'))
        warc_headers.replace_header(
            'WARC-Date', headers.rec_headers.get_header('WARC-Date'))
        no_except_close(headers.raw_stream)

    return (warc_headers, http_headers_buff, payload.raw_stream)
def load_resource(self, cdx, params):
    """Load the WARC headers and payload for *cdx* from a local WARC.

    Variant that also adjusts the record Content-Length when the
    re-serialized http headers differ in size from the originals.
    Returns (warc_headers, http_headers_buff, payload_stream), the
    cached result, or None if this loader can not handle the entry.
    """
    if cdx.get('_cached_result'):
        return cdx.get('_cached_result')

    # requires a concrete WARC filename and offset to load from
    if not cdx.get('filename') or cdx.get('offset') is None:
        return None

    orig_source = cdx.get('source', '').split(':')[0]
    formatter = ParamFormatter(params, orig_source)
    cdx._formatter = formatter

    def local_index_query(local_params):
        # re-run the index query, forwarding any 'param.'-prefixed params
        for n, v in six.iteritems(params):
            if n.startswith('param.'):
                local_params[n] = v

        cdx_iter, errs = self.cdx_source(local_params)
        for cdx in cdx_iter:
            cdx._formatter = formatter
            yield cdx

    failed_files = []
    headers, payload = (self.resolve_loader.load_headers_and_payload(
        cdx, failed_files, local_index_query))

    http_headers_buff = None

    if payload.rec_type in ('response', 'revisit'):
        status = cdx.get('status')

        # if status is not set and not, 2xx, 4xx, 5xx
        # go through self-redirect check just in case
        if not status or not status.startswith(('2', '4', '5')):
            http_headers = self.headers_parser.parse(payload.raw_stream)

            # position after parsing = size of the original header block
            try:
                orig_size = payload.raw_stream.tell()
            except:
                orig_size = 0

            try:
                self.raise_on_self_redirect(
                    params, cdx, http_headers.get_statuscode(),
                    http_headers.get_header('Location'))
            except LiveResourceException:
                # close both streams before propagating the failure
                no_except_close(headers.raw_stream)
                no_except_close(payload.raw_stream)
                raise

            http_headers_buff = http_headers.to_bytes()

            # if new http_headers_buff is different length,
            # attempt to adjust content-length on the WARC record
            if orig_size and len(http_headers_buff) != orig_size:
                orig_cl = payload.rec_headers.get_header('Content-Length')
                if orig_cl:
                    new_cl = int(orig_cl) + (len(http_headers_buff) -
                                             orig_size)
                    payload.rec_headers.replace_header(
                        'Content-Length', str(new_cl))

    warc_headers = payload.rec_headers

    # distinct headers record means this is a revisit: merge uri/date
    # info from both records into the outgoing WARC headers
    if headers != payload:
        warc_headers.replace_header(
            'WARC-Refers-To-Target-URI',
            payload.rec_headers.get_header('WARC-Target-URI'))
        warc_headers.replace_header(
            'WARC-Refers-To-Date',
            payload.rec_headers.get_header('WARC-Date'))
        warc_headers.replace_header(
            'WARC-Target-URI',
            headers.rec_headers.get_header('WARC-Target-URI'))
        warc_headers.replace_header(
            'WARC-Date', headers.rec_headers.get_header('WARC-Date'))
        no_except_close(headers.raw_stream)

    return (warc_headers, http_headers_buff, payload.raw_stream)
def load_resource(self, cdx, params):
    """Load the resource for *cdx* from the live web (or a remote
    memento) and synthesize WARC headers for it.

    Returns (warc_headers, http_headers_buff, upstream_response), or
    None if this loader can not (or should not) handle the entry.
    """
    # a cdx with filename+offset belongs to the WARC loader, not here
    if cdx.get('filename') and cdx.get('offset') is not None:
        return None

    load_url = cdx.get('load_url')
    if not load_url:
        return None

    if params.get('content_type') == VideoLoader.CONTENT_TYPE:
        return None

    if self.forward_proxy_prefix and not cdx.get('is_live'):
        load_url = self.forward_proxy_prefix + load_url

    input_req = params['_input_req']

    req_headers = input_req.get_req_headers()

    dt = timestamp_to_datetime(cdx['timestamp'])

    # remote memento: request the specific datetime
    if cdx.get('memento_url'):
        req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

    method = input_req.get_req_method()
    data = input_req.get_req_body()

    # use requests' url/auth preparation to normalize the target url
    p = PreparedRequest()
    try:
        p.prepare_url(load_url, None)
    except Exception:
        raise LiveResourceException(load_url)
    p.prepare_headers(None)
    p.prepare_auth(None, load_url)

    auth = p.headers.get('Authorization')
    if auth:
        req_headers['Authorization'] = auth

    load_url = p.url

    # host is set to the actual host for live loading
    # ensure it is set to the load_url host
    if not cdx.get('is_live'):
        #req_headers.pop('Host', '')
        req_headers['Host'] = urlsplit(p.url).netloc

    referrer = cdx.get('set_referrer')
    if referrer:
        req_headers['Referer'] = referrer

    upstream_res = self._do_request_with_redir_check(
        method, load_url, data, req_headers, params, cdx)

    memento_dt = upstream_res.headers.get('Memento-Datetime')
    if memento_dt:
        dt = http_date_to_datetime(memento_dt)
        cdx['timestamp'] = datetime_to_timestamp(dt)
    elif cdx.get('memento_url'):
        # if 'memento_url' set and no Memento-Datetime header present
        # then its an error
        no_except_close(upstream_res)
        return None

    agg_type = upstream_res.headers.get('Warcserver-Type')
    if agg_type == 'warc':
        # upstream already returned a full WARC record: pass it through
        cdx['source'] = unquote(
            upstream_res.headers.get('Warcserver-Source-Coll'))
        return None, upstream_res.headers, upstream_res

    if upstream_res.version == 11:
        version = '1.1'
    else:
        version = '1.0'

    # rebuild the http status line + headers as a serialized buffer
    status = 'HTTP/{version} {status} {reason}\r\n'
    status = status.format(version=version,
                           status=upstream_res.status,
                           reason=upstream_res.reason)

    http_headers_buff = status

    orig_resp = upstream_res._original_response

    try:  #pragma: no cover
        #PY 3
        resp_headers = orig_resp.headers._headers
        for n, v in resp_headers:
            nl = n.lower()
            if nl in self.SKIP_HEADERS:
                continue

            if nl in self.UNREWRITE_HEADERS:
                v = self.unrewrite_header(cdx, v)

            http_headers_buff += n + ': ' + v + '\r\n'

        http_headers_buff += '\r\n'

        try:
            # http headers could be encoded as utf-8 (though non-standard)
            # first try utf-8 encoding
            http_headers_buff = http_headers_buff.encode('utf-8')
        except:
            # then, fall back to latin-1
            http_headers_buff = http_headers_buff.encode('latin-1')

    except:  #pragma: no cover
        #PY 2
        resp_headers = orig_resp.msg.headers

        for line in resp_headers:
            n, v = line.split(':', 1)
            n = n.lower()
            v = v.strip()

            if n in self.SKIP_HEADERS:
                continue

            new_v = v
            if n in self.UNREWRITE_HEADERS:
                new_v = self.unrewrite_header(cdx, v)

            if new_v != v:
                http_headers_buff += n + ': ' + new_v + '\r\n'
            else:
                http_headers_buff += line

        # if python2, already byte headers, so leave as is
        http_headers_buff += '\r\n'

    # best-effort: dig out the remote peer ip from the raw socket
    try:
        fp = upstream_res._fp.fp
        if hasattr(fp, 'raw'):  #pragma: no cover
            fp = fp.raw
        remote_ip = fp._sock.getpeername()[0]
    except:  #pragma: no cover
        remote_ip = None

    warc_headers = {}

    warc_headers['WARC-Type'] = 'response'
    warc_headers['WARC-Record-ID'] = self._make_warc_id()
    warc_headers['WARC-Target-URI'] = cdx['url']
    warc_headers['WARC-Date'] = datetime_to_iso_date(dt)

    # non-live (remote memento): record where and when it was fetched
    if not cdx.get('is_live'):
        now = datetime.datetime.utcnow()
        warc_headers['WARC-Source-URI'] = cdx.get('load_url')
        warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now)

    if remote_ip:
        warc_headers['WARC-IP-Address'] = remote_ip

    ct = upstream_res.headers.get('Content-Type')
    if ct:
        metadata = self.get_custom_metadata(ct, dt)
        if metadata:
            warc_headers['WARC-JSON-Metadata'] = json.dumps(metadata)

    warc_headers['Content-Type'] = 'application/http; msgtype=response'

    # HEAD responses have no body
    if method == 'HEAD':
        content_len = 0
    else:
        content_len = upstream_res.headers.get('Content-Length', -1)

    self._set_content_len(content_len, warc_headers,
                          len(http_headers_buff))

    warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())

    return (warc_headers, http_headers_buff, upstream_res)
def handle_call(self, environ, start_response):
    """Proxy the incoming request to the upstream host, optionally
    recording the request/response pair via the stream wrappers.

    Skip filters can exclude a request (before sending) or a response
    (after receiving) from recording.
    """
    input_req = DirectWSGIInputRequest(environ)

    params = self._get_params(environ)

    request_uri = input_req.get_full_request_uri()

    input_buff = input_req.get_req_body()

    headers = input_req.get_req_headers()

    method = input_req.get_req_method()

    path = environ['PATH_INFO']

    # write request body as metadata/resource
    put_record = params.get('put_record')
    if put_record and method in ('PUT', 'POST'):
        return self._put_record(request_uri, input_buff, put_record,
                                headers, params, start_response)

    skipping = any(x.skip_request(path, headers)
                   for x in self.skip_filters)

    req_is_wrapped = False

    # wrap the request stream so its body is buffered for recording
    if not skipping:
        req_stream = ReqWrapper(input_buff, headers, params,
                                self.create_buff_func)
        req_is_wrapped = True
    else:
        req_stream = input_buff

    data = None
    if input_buff:
        data = req_stream

    try:
        res = requests.request(url=self.upstream_host + request_uri,
                               method=method,
                               data=data,
                               headers=headers,
                               allow_redirects=False,
                               stream=True)
        res.raise_for_status()
    except Exception as e:
        # NOTE(review): if raise_for_status() raised, 'res' exists but is
        # not closed here -- verify requests releases the connection.
        if req_is_wrapped:
            no_except_close(req_stream.out)
        return self.send_error(e, start_response)

    # response-side skip filters may still exclude this exchange
    if not skipping:
        skipping = any(x.skip_response(path, req_stream.headers,
                                       res.headers, params)
                       for x in self.skip_filters)

    if not skipping:
        # wrap the response so it is queued for WARC writing as it streams
        resp_stream = RespWrapper(res.raw, res.headers, req_stream,
                                  params, self.write_queue, path,
                                  self.create_buff_func)
    else:
        resp_stream = res.raw
        if req_is_wrapped:
            no_except_close(req_stream.out)

    resp_iter = StreamIter(resp_stream)

    # ensure TE header from upstream is not included,
    # added automatically by wsgi app
    res.headers.pop('Transfer-Encoding', '')

    start_response('200 OK', list(res.headers.items()))
    return resp_iter
def compute_page_range(self, reader, query):
    """Generator yielding the idx lines for the requested page of a
    zipnum index (or a single page-info record for page-count queries).

    The *reader* is closed on every exit path.
    """
    pagesize = query.page_size
    if not pagesize:
        pagesize = self.max_blocks
    else:
        try:
            pagesize = int(pagesize)
        except ValueError:
            msg = 'Invalid value for pageSize= param: {}'
            raise CDXException(msg.format(pagesize))

    last_line = None

    # Get End
    end_iter = search(reader, query.end_key, prev_size=1)

    try:
        end_line = six.next(end_iter)
    except StopIteration:
        # no line >= end_key: the range extends to the last line
        last_line = read_last_line(reader)
        end_line = last_line

    # Get Start
    first_iter = iter_range(reader, query.key, query.end_key, prev_size=1)

    try:
        first_line = six.next(first_iter)
    except StopIteration:
        if end_line == last_line and query.key >= last_line:
            first_line = last_line
        else:
            # empty range: nothing to yield (other than a zero page-info)
            no_except_close(reader)
            if query.page_count:
                yield self._page_info(0, pagesize, 0)
            return

    first = IDXObject(first_line)

    end = IDXObject(end_line)

    try:
        blocks = end['lineno'] - first['lineno']
        total_pages = int(blocks / pagesize) + 1
    except:
        blocks = -1
        total_pages = 1

    if query.page_count:
        # same line, so actually need to look at cdx
        # to determine if it exists
        if blocks == 0:
            try:
                block_cdx_iter = self.idx_to_cdx([first_line], query)
                block = six.next(block_cdx_iter)
                cdx = six.next(block)
            except StopIteration:
                total_pages = 0
                blocks = -1

        yield self._page_info(total_pages, pagesize, blocks + 1)
        no_except_close(reader)
        return

    curr_page = query.page
    if curr_page >= total_pages or curr_page < 0:
        msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
        no_except_close(reader)
        raise CDXException(msg.format(curr_page, total_pages - 1))

    startline = curr_page * pagesize
    endline = startline + pagesize - 1
    if blocks >= 0:
        endline = min(endline, blocks)

    if curr_page == 0:
        yield first_line
    else:
        # first_line was already consumed from first_iter; shift window
        startline -= 1

    try:
        idxiter = itertools.islice(first_iter, startline, endline)
        for idx in idxiter:
            yield idx
    except Exception:
        pass
    finally:
        no_except_close(reader)
def render_content(self, wb_url, kwargs, environ):
    """Replay a single archived (or live) resource for *wb_url*.

    Variant with Prefer-header negotiation, proxy preflight handling,
    timemap/query dispatch, and optional redirect-to-exact-timestamp.
    Returns the final WbResponse.
    """
    # '#' would be dropped as a fragment when parsed; escape it first
    wb_url = wb_url.replace('#', '%23')
    wb_url = WbUrl(wb_url)

    # 'history page' mode: replay the given page url as an ajax request
    history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
    if history_page:
        wb_url.url = history_page
        is_ajax = True
    else:
        is_ajax = self.is_ajax(environ)

    is_timegate = self._check_accept_dt(wb_url, environ)

    self.prepare_env(environ)
    host_prefix = environ['pywb.host_prefix']
    rel_prefix = self.get_rel_prefix(environ)
    full_prefix = host_prefix + rel_prefix
    pywb_static_prefix = environ['pywb.static_prefix'] + '/'
    is_proxy = ('wsgiprox.proxy_host' in environ)

    # if OPTIONS in proxy mode, just generate the proxy responss
    if is_proxy and self.is_preflight(environ):
        return WbResponse.options_response(environ)

    if self.use_js_obj_proxy:
        content_rw = self.js_proxy_rw
    else:
        content_rw = self.default_rw

    # no redirects if in proxy
    redirect_to_exact = self.redirect_to_exact and not is_proxy

    # Check Prefer
    pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ,
                                                  content_rw, is_proxy)

    response = None
    keep_frame_response = False

    # prefer overrides custom response?
    if pref_mod is not None:
        # fast-redirect to preferred
        if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
            new_url = full_prefix + wb_url.to_str(mod=pref_mod)
            headers = [('Preference-Applied', pref_applied),
                       ('Vary', 'Prefer')]

            return WbResponse.redir_response(new_url,
                                             '307 Temporary Redirect',
                                             headers=headers)
        else:
            wb_url.mod = pref_mod
    else:
        # dispatch timemap / query / custom (eg. top-frame) responses
        if kwargs.get('output'):
            response = self.handle_timemap(wb_url, kwargs, full_prefix)

        elif wb_url.is_query():
            response = self.handle_query(environ, wb_url, kwargs,
                                         full_prefix)

        else:
            response = self.handle_custom_response(environ, wb_url,
                                                   full_prefix,
                                                   host_prefix, kwargs)

        # timegates may still need the cdx timestamp before responding
        keep_frame_response = (not kwargs.get('no_timegate_check') and
                               is_timegate and
                               not is_proxy) or redirect_to_exact

    if response and not keep_frame_response:
        return self.format_response(response, wb_url, full_prefix,
                                    is_timegate, is_proxy)

    # proxy mode uses an identity rewriter and no framing
    if is_proxy:
        environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
        urlrewriter = IdentityUrlRewriter(wb_url, '')
        framed_replay = False
    else:
        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix,
                                  pywb_static_prefix=pywb_static_prefix)
        framed_replay = self.framed_replay

    url_parts = urlsplit(wb_url.url)
    if not url_parts.path:
        return self.send_redirect('/', url_parts, urlrewriter)

    self.unrewrite_referrer(environ, full_prefix)

    urlkey = canonicalize(wb_url.url)

    inputreq = RewriteInputRequest(environ, urlkey, wb_url.url,
                                   content_rw)
    inputreq.include_method_query(wb_url.url)

    range_start, range_end, skip_record = self._check_range(
        inputreq, wb_url)

    # pass along tracked cookies for this collection, if any
    setcookie_headers = None
    cookie_key = None
    if self.cookie_tracker:
        cookie_key = self.get_cookie_key(kwargs)
        if cookie_key:
            res = self.cookie_tracker.get_cookie_headers(
                wb_url.url, urlrewriter, cookie_key,
                environ.get('HTTP_COOKIE', ''))
            inputreq.extra_cookie, setcookie_headers = res

    r = self._do_req(inputreq, wb_url, kwargs, skip_record)

    # upstream error: read error body (best-effort) and re-raise
    if r.status_code >= 400:
        error = None
        try:
            error = r.raw.read()
        except Exception:
            pass
        finally:
            no_except_close(r.raw)

        if error:
            error = error.decode('utf-8')
        else:
            error = ''

        details = dict(args=kwargs, error=error)
        if r.status_code == 404:
            raise NotFoundException(url=wb_url.url, msg=details)
        else:
            raise UpstreamException(r.status_code, url=wb_url.url,
                                    details=details)

    cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

    cdx_url_parts = urlsplit(cdx['url'])

    if cdx_url_parts.path.endswith(
            '/') and not url_parts.path.endswith('/'):
        # add trailing slash
        new_path = url_parts.path + '/'

        no_except_close(r.raw)
        return self.send_redirect(new_path, url_parts, urlrewriter)

    # only redirect to exact if not live, otherwise set to false
    redirect_to_exact = redirect_to_exact and not cdx.get('is_live')

    # return top-frame timegate response, with timestamp from cdx
    if response and keep_frame_response and (not redirect_to_exact or
                                             not is_timegate):
        no_except_close(r.raw)
        return self.format_response(response, wb_url, full_prefix,
                                    is_timegate, is_proxy,
                                    cdx['timestamp'])

    stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
    record = self.loader.parse_record_stream(stream,
                                             ensure_http_headers=True)

    memento_dt = r.headers.get('Memento-Datetime')
    target_uri = r.headers.get('WARC-Target-URI')

    # cdx['urlkey'] = urlkey
    # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
    # cdx['url'] = target_uri

    set_content_loc = False

    # Check if Fuzzy Match
    if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
        set_content_loc = True

    # if redirect to exact timestamp (only set if not live)
    if redirect_to_exact:
        if set_content_loc or is_timegate or wb_url.timestamp != cdx.get(
                'timestamp'):
            new_url = urlrewriter.get_new_url(url=target_uri,
                                              timestamp=cdx['timestamp'],
                                              mod=wb_url.mod)

            resp = WbResponse.redir_response(new_url,
                                             '307 Temporary Redirect')
            if self.enable_memento:
                if is_timegate and not is_proxy:
                    self._add_memento_links(target_uri, full_prefix,
                                            memento_dt, cdx['timestamp'],
                                            resp.status_headers,
                                            is_timegate, is_proxy,
                                            pref_applied=pref_applied,
                                            mod=pref_mod,
                                            is_memento=False)
                else:
                    resp.status_headers['Link'] = MementoUtils.make_link(
                        target_uri, 'original')

            return resp

    self._add_custom_params(cdx, r.headers, kwargs, record)

    # range request: switch to identity mod so body isn't rewritten
    if self._add_range(record, wb_url, range_start, range_end):
        wb_url.mod = 'id_'

    if is_ajax:
        head_insert_func = None
        urlrewriter.rewrite_opts['is_ajax'] = True
    else:
        top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
        head_insert_func = (self.head_insert_view.create_insert_func(
            wb_url, full_prefix, host_prefix, top_url, environ,
            framed_replay, coll=kwargs.get('coll', ''),
            replay_mod=self.replay_mod,
            metadata=kwargs.get('metadata', {}),
            config=self.config))

    cookie_rewriter = None
    if self.cookie_tracker and cookie_key:
        # skip add cookie if service worker is not 200
        # it seems cookie headers from service workers are not applied, so don't update in cache
        if wb_url.mod == 'sw_':
            cookie_key = None
        cookie_rewriter = self.cookie_tracker.get_rewriter(
            urlrewriter, cookie_key)

    urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

    result = content_rw(record, urlrewriter, cookie_rewriter,
                        head_insert_func, cdx, environ)

    status_headers, gen, is_rw = result

    # history page mode: only the title is needed, not the body
    if history_page:
        title = DefaultRewriter._extract_title(gen)
        if not title:
            title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))
        if not title:
            title = history_page
        self._add_history_page(cdx, kwargs, title)
        return WbResponse.json_response({'title': title})

    if setcookie_headers:
        status_headers.headers.extend(setcookie_headers)

    # ensure the statusline has a reason phrase
    if ' ' not in status_headers.statusline:
        status_headers.statusline += ' None'

    if not is_ajax and self.enable_memento:
        self._add_memento_links(cdx['url'], full_prefix, memento_dt,
                                cdx['timestamp'], status_headers,
                                is_timegate, is_proxy,
                                cdx.get('source-coll'),
                                mod=pref_mod,
                                pref_applied=pref_applied)

        set_content_loc = True

    if set_content_loc and not redirect_to_exact and not is_proxy:
        status_headers.headers.append(
            ('Content-Location',
             urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                     url=cdx['url'])))

    if not is_proxy:
        self.add_csp_header(wb_url, status_headers)

    response = WbResponse(status_headers, gen)

    if is_proxy and environ.get('HTTP_ORIGIN'):
        response.add_access_control_headers(environ)

    # allow long-lived caching for explicitly cacheable subresources
    if r.status_code == 200 and kwargs.get(
            'cache') == 'always' and environ.get('HTTP_REFERER'):
        response.status_headers[
            'Cache-Control'] = 'public, max-age=31536000, immutable'

    return response
def iter_blocks(reader):
    """Yield each decompressed block for the collected ranges,
    guaranteeing *reader* is closed when iteration ends or aborts.
    """
    try:
        yield from (decompress_block(block_range) for block_range in ranges)
    finally:
        no_except_close(reader)