def put_custom_record(self, environ, coll):
    """PUT the request body to the recorder as a custom ``resource`` record.

    The body is read from ``environ["wsgi.input"]``; the ``url`` (required)
    and ``timestamp`` (optional) values are taken from the query string.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The collection name substituted into the recorder path
    :return: JSON response relaying the recorder's reply, or a
        ``400 Bad Request`` JSON error when no ``url`` query param is given
    :rtype: WbResponse
    """
    # Drain the input stream: keep reading until an empty read is returned.
    chunks = []
    while True:
        buff = environ["wsgi.input"].read()
        if not buff:
            break
        chunks.append(buff)

    data = b"".join(chunks)

    params = dict(parse_qsl(environ.get("QUERY_STRING", "")))

    rec_type = "resource"

    headers = {"Content-Type": environ.get("CONTENT_TYPE", "text/plain")}

    target_uri = params.get("url")
    if not target_uri:
        # A missing required parameter is a client error, not a success.
        return WbResponse.json_response(
            {"error": "no url"}, status="400 Bad Request"
        )

    timestamp = params.get("timestamp")
    if timestamp:
        headers["WARC-Date"] = timestamp_to_iso_date(timestamp)

    put_url = self.custom_record_path.format(
        url=target_uri, coll=coll, rec_type=rec_type
    )
    res = requests.put(put_url, headers=headers, data=data)

    res = res.json()

    return WbResponse.json_response(res)
def get_wacz(self, environ, coll):
    """Build a WACZ file from every file in the collection's archive directory.

    All entries of ``collections/<coll>/archive`` are passed to ``wacz_main``
    together with ``-o /tmp/out/archive.wacz`` and, when a ``url`` query
    param is present, ``--url <url>``.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection to package
    :return: ``{"done": <output path>}`` on success, or a
        ``500 Internal Server Error`` JSON error if ``wacz_main`` raised
    :rtype: WbResponse
    """
    # Readiness check disabled in the original code, kept for reference:
    # if self.pending_count != 0 or self.pending_size != 0:
    #     return WbResponse.json_response(
    #         {"error": "not_ready"}, status="404 Not Found"
    #     )

    params = dict(parse_qsl(environ.get("QUERY_STRING")))

    output_path = "/tmp/out/archive.wacz"
    archive_dir = os.path.join("collections", coll, "archive")

    wacz_args = [
        os.path.join(archive_dir, name) for name in os.listdir(archive_dir)
    ]
    wacz_args.extend(["-o", output_path])

    url = params.get("url")
    if url:
        wacz_args.extend(["--url", url])

    try:
        wacz_main(wacz_args)
    except Exception as e:
        # Previously the failure was only printed and the caller was still
        # told the archive was "done"; report the failure instead.
        print(e)
        return WbResponse.json_response(
            {"error": "wacz_failed"}, status="500 Internal Server Error"
        )

    return WbResponse.json_response({"done": output_path})
def serve_listing(self, environ):
    """Return the warcserver's fixed and dynamic routes as a JSON response.

    :param dict environ: The WSGI environment dictionary for the request
    :return: JSON response with ``fixed`` and ``dynamic`` route listings
    :rtype: WbResponse
    """
    fixed_routes = self.warcserver.list_fixed_routes()
    dynamic_routes = self.warcserver.list_dynamic_routes()
    listing = {'fixed': fixed_routes, 'dynamic': dynamic_routes}
    return WbResponse.json_response(listing)
def exit(self, environ=None):
    """Send SIGTERM to the uwsgi master process and return an empty JSON reply.

    :param dict environ: Unused; accepted so the method can serve as a handler
    :return: An empty JSON object response, built before the signal is sent
    :rtype: WbResponse
    """
    # Imported lazily: the uwsgi module is only needed when this is called.
    import uwsgi
    import signal

    reply = WbResponse.json_response({})
    master_pid = uwsgi.masterpid()
    os.kill(master_pid, signal.SIGTERM)
    return reply
def test_wbresponse_json_response():
    """json_response() yields a 200 JSON response whose body round-trips."""
    payload = {'pywb': 1, 'wr': 2}
    response = WbResponse.json_response(payload)

    headers = response.status_headers
    assert headers.statusline == '200 OK'

    json_content_type = ('Content-Type', 'application/json; charset=utf-8')
    assert json_content_type in headers.headers

    decoded = json.loads(response.body[0])
    assert decoded == payload
def put_custom_record(self, environ, coll="$root"):
    """Store the request body as a custom WARC ``resource`` record.

    Only usable while recording: the body is forwarded via HTTP PUT to
    the recorder endpoint for the given collection.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection the record is to be served from
    """
    # Read until the input stream returns an empty chunk.
    pieces = []
    piece = environ["wsgi.input"].read()
    while piece:
        pieces.append(piece)
        piece = environ["wsgi.input"].read()
    data = b"".join(pieces)

    query = dict(parse_qsl(environ.get("QUERY_STRING")))
    rec_type = "resource"
    content_type = environ.get("CONTENT_TYPE", "text/plain")
    headers = {"Content-Type": content_type}

    target_uri = query.get("url")
    if not target_uri:
        return WbResponse.json_response({"error": "no url"}, status="400 Bad Request")

    timestamp = query.get("timestamp")
    if timestamp:
        headers["WARC-Date"] = timestamp_to_iso_date(timestamp)

    put_url = self.put_custom_record_path.format(
        url=target_uri,
        coll=coll,
        rec_type=rec_type,
    )
    recorder_reply = requests.put(put_url, headers=headers, data=data)
    return WbResponse.json_response(recorder_reply.json())
def serve_listing(self, environ):
    """Respond with the WARCServer's fixed and dynamic route listings.

    :param dict environ: The WSGI environment dictionary for the request
    :return: WbResponse containing the WARCServer fixed and dynamic paths
    :rtype: WbResponse
    """
    warcserver = self.warcserver
    routes = dict(
        fixed=warcserver.list_fixed_routes(),
        dynamic=warcserver.list_dynamic_routes(),
    )
    return WbResponse.json_response(routes)
def serve_listing(self, environ):
    """Build the JSON listing of WARCServer routes.

    :param dict environ: The WSGI environment dictionary for the request
    :return: WbResponse whose JSON body has ``fixed`` and ``dynamic`` entries
    :rtype: WbResponse
    """
    result = {}
    result['fixed'] = self.warcserver.list_fixed_routes()
    result['dynamic'] = self.warcserver.list_dynamic_routes()
    return WbResponse.json_response(result)
def put_screenshot(self, environ, coll):
    """PUT the request body to the screenshot recorder for ``coll``.

    The target is taken from the first ``target_uri`` query param. The
    request's input stream is handed to ``requests.put`` directly rather
    than being read into memory first.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The collection name; ``ensure_coll_exists`` is called on it
    :return: JSON response relaying the recorder's reply, or a
        ``400 Bad Request`` JSON error when ``target_uri`` is missing
    :rtype: WbResponse
    """
    self.ensure_coll_exists(coll)

    headers = {'Content-Type': environ.get('CONTENT_TYPE', 'text/plain')}

    query_data = parse_qs(environ.get('QUERY_STRING'))

    # parse_qs maps each key to a non-empty list; default to an empty value.
    url = query_data.get('target_uri', [''])[0]

    if not url:
        # A missing required parameter is a client error, not a success.
        return WbResponse.json_response(
            {'error': 'no target_uri'}, status='400 Bad Request'
        )

    put_url = self.screenshot_recorder_path.format(url=url, coll=coll)
    res = requests.put(put_url, headers=headers, data=environ['wsgi.input'])

    res = res.json()

    return WbResponse.json_response(res)
def put_record(self, environ, coll, target_uri_format, rec_type, params, data):
    """PUT ``data`` to the recorder as a custom record of type ``rec_type``.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The collection name; ``ensure_coll_exists`` is called on it
    :param str target_uri_format: Format string with a ``{url}`` field, used
        to derive the record's target URI from the ``url`` param
    :param str rec_type: Record type substituted into the recorder path
    :param dict params: Request params; ``url`` is required and ``timestamp``
        is optional
    :param data: The record payload passed through to ``requests.put``
    :return: JSON response relaying the recorder's reply, or a
        ``400 Bad Request`` JSON error when ``url`` is missing
    :rtype: WbResponse
    """
    self.ensure_coll_exists(coll)

    headers = {'Content-Type': environ.get('CONTENT_TYPE', 'text/plain')}

    url = params.get('url')
    if not url:
        # A missing required parameter is a client error, not a success.
        return WbResponse.json_response(
            {'error': 'no url'}, status='400 Bad Request'
        )

    timestamp = params.get('timestamp')
    if timestamp:
        headers['WARC-Date'] = timestamp_to_iso_date(timestamp)

    target_uri = target_uri_format.format(url=url)
    put_url = self.custom_record_path.format(
        url=target_uri, coll=coll, rec_type=rec_type
    )
    res = requests.put(put_url, headers=headers, data=data)

    res = res.json()

    return WbResponse.json_response(res)
def test_wbresponse_callable():
    """Calling a WbResponse as a WSGI app reports status/headers and returns the body."""
    expected_body = {'pywb': 1, 'wr': 2}
    res = WbResponse.json_response(expected_body)
    env = {'REQUEST_METHOD': 'GET'}

    expected_headers = [
        ('Content-Type', 'application/json; charset=utf-8'),
        ('Content-Length', '17'),
    ]
    expected_passed_values = {
        'status_line': '200 OK',
        'headers': expected_headers,
    }

    # Filled in by the start_response callback below.
    captured = {'status_line': None, 'headers': None}

    def start_response(status_line, headers):
        captured.update(status_line=status_line, headers=headers)

    body = res(env, start_response)

    assert json.loads(body[0]) == expected_body
    assert captured == expected_passed_values
def get_pending(self, environ):
    """Report the current pending count and pending size as JSON.

    :param dict environ: The WSGI environment dictionary for the request (unused)
    :return: JSON response with ``count`` and ``size`` keys
    :rtype: WbResponse
    """
    pending = dict(count=self.pending_count, size=self.pending_size)
    return WbResponse.json_response(pending)
def render_content(self, wb_url, kwargs, environ):
    """Fetch the capture addressed by ``wb_url`` and build the replay response.

    Depending on the request this returns a redirect, a formatted
    timemap/query/custom response, a JSON ``{'title': ...}`` reply for
    history-page requests, or the rewritten record content.

    :param str wb_url: The raw archival url string; parsed into a ``WbUrl``
    :param dict kwargs: Route/collection options; keys read here include
        ``output``, ``no_timegate_check``, ``coll``, ``metadata`` and ``cache``
    :param dict environ: The WSGI environment dictionary for the request
    :return: The response to send to the client
    :rtype: WbResponse
    :raises NotFoundException: if the upstream request returns 404
    :raises UpstreamException: if the upstream request returns any other
        status >= 400
    """
    # escape '#' so it is kept as part of the url when parsed
    wb_url = wb_url.replace('#', '%23')
    wb_url = WbUrl(wb_url)

    # a history-page request overrides the url and is treated as ajax
    history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
    if history_page:
        wb_url.url = history_page
        is_ajax = True
    else:
        is_ajax = self.is_ajax(environ)

    is_timegate = self._check_accept_dt(wb_url, environ)

    self.prepare_env(environ)

    host_prefix = environ['pywb.host_prefix']
    rel_prefix = self.get_rel_prefix(environ)
    full_prefix = host_prefix + rel_prefix
    pywb_static_prefix = environ['pywb.static_prefix'] + '/'
    # proxy mode is detected by the presence of the wsgiprox key
    is_proxy = ('wsgiprox.proxy_host' in environ)

    # if OPTIONS in proxy mode, just generate the proxy response
    if is_proxy and self.is_preflight(environ):
        return WbResponse.options_response(environ)

    if self.use_js_obj_proxy:
        content_rw = self.js_proxy_rw
    else:
        content_rw = self.default_rw

    # no redirects if in proxy
    redirect_to_exact = self.redirect_to_exact and not is_proxy

    # Check Prefer
    pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ,
                                                  content_rw, is_proxy)

    response = None
    keep_frame_response = False

    # prefer overrides custom response?
    if pref_mod is not None:
        # fast-redirect to preferred
        if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
            new_url = full_prefix + wb_url.to_str(mod=pref_mod)
            headers = [('Preference-Applied', pref_applied),
                       ('Vary', 'Prefer')]

            return WbResponse.redir_response(new_url,
                                             '307 Temporary Redirect',
                                             headers=headers)
        else:
            wb_url.mod = pref_mod
    else:
        if kwargs.get('output'):
            response = self.handle_timemap(wb_url, kwargs, full_prefix)

        elif wb_url.is_query():
            response = self.handle_query(environ, wb_url, kwargs, full_prefix)

        else:
            response = self.handle_custom_response(environ, wb_url,
                                                   full_prefix, host_prefix,
                                                   kwargs)

            # when set, the custom response is held back until the cdx
            # for the capture has been loaded (see use further below)
            keep_frame_response = (not kwargs.get('no_timegate_check') and is_timegate and not is_proxy) or redirect_to_exact

    if response and not keep_frame_response:
        return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)

    if is_proxy:
        environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
        urlrewriter = IdentityUrlRewriter(wb_url, '')
        framed_replay = False

    else:
        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix,
                                  pywb_static_prefix=pywb_static_prefix)

        framed_replay = self.framed_replay

    # a url with an empty path is redirected to the same url with path '/'
    url_parts = urlsplit(wb_url.url)
    if not url_parts.path:
        return self.send_redirect('/', url_parts, urlrewriter)

    self.unrewrite_referrer(environ, full_prefix)

    urlkey = canonicalize(wb_url.url)

    inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

    inputreq.include_method_query(wb_url.url)

    range_start, range_end, skip_record = self._check_range(
        inputreq, wb_url)

    setcookie_headers = None
    cookie_key = None
    if self.cookie_tracker:
        cookie_key = self.get_cookie_key(kwargs)
        if cookie_key:
            res = self.cookie_tracker.get_cookie_headers(
                wb_url.url, urlrewriter, cookie_key,
                environ.get('HTTP_COOKIE', ''))
            inputreq.extra_cookie, setcookie_headers = res

    r = self._do_req(inputreq, wb_url, kwargs, skip_record)

    if r.status_code >= 400:
        error = None
        # best-effort read of the error body; the raw stream is always closed
        try:
            error = r.raw.read()
        except Exception:
            pass
        finally:
            no_except_close(r.raw)

        if error:
            error = error.decode('utf-8')
        else:
            error = ''

        details = dict(args=kwargs, error=error)
        if r.status_code == 404:
            raise NotFoundException(url=wb_url.url, msg=details)

        else:
            raise UpstreamException(r.status_code, url=wb_url.url,
                                    details=details)

    cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

    cdx_url_parts = urlsplit(cdx['url'])

    if cdx_url_parts.path.endswith(
            '/') and not url_parts.path.endswith('/'):
        # add trailing slash
        new_path = url_parts.path + '/'

        # close the upstream stream before returning early
        no_except_close(r.raw)

        return self.send_redirect(new_path, url_parts, urlrewriter)

    # only redirect to exact if not live, otherwise set to false
    redirect_to_exact = redirect_to_exact and not cdx.get('is_live')

    # return top-frame timegate response, with timestamp from cdx
    if response and keep_frame_response and (not redirect_to_exact or not is_timegate):
        no_except_close(r.raw)
        return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy, cdx['timestamp'])

    stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
    record = self.loader.parse_record_stream(stream,
                                             ensure_http_headers=True)

    memento_dt = r.headers.get('Memento-Datetime')
    target_uri = r.headers.get('WARC-Target-URI')

    # cdx['urlkey'] = urlkey
    # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
    # cdx['url'] = target_uri

    set_content_loc = False

    # Check if Fuzzy Match
    if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
        set_content_loc = True

    # if redirect to exact timestamp (only set if not live)
    if redirect_to_exact:
        if set_content_loc or is_timegate or wb_url.timestamp != cdx.get(
                'timestamp'):
            new_url = urlrewriter.get_new_url(url=target_uri,
                                              timestamp=cdx['timestamp'],
                                              mod=wb_url.mod)

            resp = WbResponse.redir_response(new_url,
                                             '307 Temporary Redirect')
            if self.enable_memento:
                if is_timegate and not is_proxy:
                    self._add_memento_links(target_uri, full_prefix,
                                            memento_dt, cdx['timestamp'],
                                            resp.status_headers,
                                            is_timegate, is_proxy,
                                            pref_applied=pref_applied,
                                            mod=pref_mod,
                                            is_memento=False)

                else:
                    resp.status_headers['Link'] = MementoUtils.make_link(
                        target_uri, 'original')

            return resp

    self._add_custom_params(cdx, r.headers, kwargs, record)

    # when _add_range() returns a true value, switch the modifier to 'id_'
    if self._add_range(record, wb_url, range_start, range_end):
        wb_url.mod = 'id_'

    # ajax requests get no head insert
    if is_ajax:
        head_insert_func = None
        urlrewriter.rewrite_opts['is_ajax'] = True
    else:
        top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
        head_insert_func = (self.head_insert_view.create_insert_func(
            wb_url, full_prefix, host_prefix, top_url, environ,
            framed_replay, coll=kwargs.get('coll', ''),
            replay_mod=self.replay_mod, metadata=kwargs.get('metadata', {}),
            config=self.config))

    cookie_rewriter = None
    if self.cookie_tracker and cookie_key:
        # skip add cookie if service worker is not 200
        # it seems cookie headers from service workers are not applied, so don't update in cache
        if wb_url.mod == 'sw_':
            cookie_key = None

        cookie_rewriter = self.cookie_tracker.get_rewriter(
            urlrewriter, cookie_key)

    urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

    result = content_rw(record, urlrewriter, cookie_rewriter,
                        head_insert_func, cdx, environ)

    status_headers, gen, is_rw = result

    # history-page requests only need a title: prefer the one extracted
    # from the content, then the request header, then the page url itself
    if history_page:
        title = DefaultRewriter._extract_title(gen)
        if not title:
            title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

        if not title:
            title = history_page

        self._add_history_page(cdx, kwargs, title)
        return WbResponse.json_response({'title': title})

    if setcookie_headers:
        status_headers.headers.extend(setcookie_headers)

    # a status line without a space gets the placeholder text ' None' appended
    if ' ' not in status_headers.statusline:
        status_headers.statusline += ' None'

    if not is_ajax and self.enable_memento:
        self._add_memento_links(cdx['url'], full_prefix, memento_dt,
                                cdx['timestamp'], status_headers,
                                is_timegate, is_proxy,
                                cdx.get('source-coll'),
                                mod=pref_mod, pref_applied=pref_applied)

        set_content_loc = True

    if set_content_loc and not redirect_to_exact and not is_proxy:
        status_headers.headers.append(
            ('Content-Location',
             urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                     url=cdx['url'])))

    if not is_proxy:
        self.add_csp_header(wb_url, status_headers)

    response = WbResponse(status_headers, gen)

    if is_proxy and environ.get('HTTP_ORIGIN'):
        response.add_access_control_headers(environ)

    # long-lived caching only for 200 responses on 'cache: always' routes
    # when the request carries a Referer header
    if r.status_code == 200 and kwargs.get(
            'cache') == 'always' and environ.get('HTTP_REFERER'):
        response.status_headers[
            'Cache-Control'] = 'public, max-age=31536000, immutable'

    return response
def render_content(self, wb_url, kwargs, environ):
    """Fetch the capture addressed by ``wb_url`` and build the replay response.

    Depending on the request this returns a redirect, a formatted custom
    response, a JSON ``{'title': ...}`` reply for history-page requests,
    or the rewritten record content.

    :param str wb_url: The raw archival url string; parsed into a ``WbUrl``
    :param dict kwargs: Route/collection options; ``coll`` is read here
    :param dict environ: The WSGI environment dictionary for the request
    :return: The response to send to the client
    :rtype: WbResponse
    :raises UpstreamException: if the upstream request returns a status >= 400
    """
    # escape '#' so it is kept as part of the url when parsed
    wb_url = wb_url.replace('#', '%23')
    wb_url = WbUrl(wb_url)

    # X-Forwarded-Proto header wins, falling back to the configured scheme
    proto = environ.get('HTTP_X_FORWARDED_PROTO', self.force_scheme)

    if proto:
        environ['wsgi.url_scheme'] = proto

    # a history-page request overrides the url and is treated as ajax
    history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
    if history_page:
        wb_url.url = history_page
        is_ajax = True
    else:
        is_ajax = self.is_ajax(environ)

    is_timegate = self._check_accept_dt(wb_url, environ)

    host_prefix = self.get_host_prefix(environ)
    rel_prefix = self.get_rel_prefix(environ)
    full_prefix = host_prefix + rel_prefix
    environ['pywb.host_prefix'] = host_prefix
    pywb_static_prefix = host_prefix + environ.get(
        'pywb.app_prefix', '') + environ.get('pywb.static_prefix', '/static/')
    # proxy mode is detected by the presence of the wsgiprox key
    is_proxy = ('wsgiprox.proxy_host' in environ)

    response = self.handle_custom_response(environ, wb_url,
                                           full_prefix, host_prefix,
                                           kwargs)

    if response:
        return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)

    if is_proxy:
        environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
        urlrewriter = IdentityUrlRewriter(wb_url, '')
        framed_replay = False

    else:
        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix,
                                  pywb_static_prefix=pywb_static_prefix)

        framed_replay = self.framed_replay

    # a url with an empty path is redirected to the same url with path '/'
    url_parts = urlsplit(wb_url.url)
    if not url_parts.path:
        return self.send_redirect('/', url_parts, urlrewriter)

    self.unrewrite_referrer(environ, full_prefix)

    urlkey = canonicalize(wb_url.url)

    if self.use_js_obj_proxy:
        content_rw = self.js_proxy_rw
    else:
        content_rw = self.default_rw

    inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

    inputreq.include_method_query(wb_url.url)

    range_start, range_end, skip_record = self._check_range(
        inputreq, wb_url)

    setcookie_headers = None
    cookie_key = None
    if self.cookie_tracker:
        cookie_key = self.get_cookie_key(kwargs)
        if cookie_key:
            res = self.cookie_tracker.get_cookie_headers(
                wb_url.url, urlrewriter, cookie_key,
                environ.get('HTTP_COOKIE', ''))
            inputreq.extra_cookie, setcookie_headers = res

    r = self._do_req(inputreq, wb_url, kwargs, skip_record)

    if r.status_code >= 400:
        error = None
        # best-effort read of the error body; the raw stream is always closed
        try:
            error = r.raw.read()
        except Exception:
            pass
        finally:
            no_except_close(r.raw)

        if error:
            error = error.decode('utf-8')
        else:
            error = ''

        details = dict(args=kwargs, error=error)
        raise UpstreamException(r.status_code, url=wb_url.url, details=details)

    cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

    cdx_url_parts = urlsplit(cdx['url'])

    if cdx_url_parts.path.endswith(
            '/') and not url_parts.path.endswith('/'):
        # add trailing slash
        new_path = url_parts.path + '/'

        # close the upstream stream before returning early
        no_except_close(r.raw)

        return self.send_redirect(new_path, url_parts, urlrewriter)

    stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
    record = self.loader.parse_record_stream(stream,
                                             ensure_http_headers=True)

    memento_dt = r.headers.get('Memento-Datetime')
    target_uri = r.headers.get('WARC-Target-URI')

    # cdx['urlkey'] = urlkey
    # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
    # cdx['url'] = target_uri

    set_content_loc = False

    # Check if Fuzzy Match
    if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
        set_content_loc = True

    # if redir to exact, redir if url or ts are different
    if self.redirect_to_exact:
        if (set_content_loc or
                (wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):

            new_url = urlrewriter.get_new_url(url=target_uri,
                                              timestamp=cdx['timestamp'],
                                              mod=wb_url.mod)

            resp = WbResponse.redir_response(new_url,
                                             '307 Temporary Redirect')
            if self.enable_memento:
                if is_timegate and not is_proxy:
                    self._add_memento_links(target_uri, full_prefix,
                                            memento_dt, cdx['timestamp'],
                                            resp.status_headers,
                                            is_timegate, is_proxy)

                else:
                    resp.status_headers['Link'] = MementoUtils.make_link(
                        target_uri, 'original')

            return resp

    self._add_custom_params(cdx, r.headers, kwargs, record)

    # when _add_range() returns a true value, switch the modifier to 'id_'
    if self._add_range(record, wb_url, range_start, range_end):
        wb_url.mod = 'id_'

    # ajax requests get no head insert
    if is_ajax:
        head_insert_func = None
        urlrewriter.rewrite_opts['is_ajax'] = True
    else:
        top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
        head_insert_func = (self.head_insert_view.create_insert_func(
            wb_url, full_prefix, host_prefix, top_url, environ,
            framed_replay, coll=kwargs.get('coll', ''),
            replay_mod=self.replay_mod, config=self.config))

    cookie_rewriter = None
    if self.cookie_tracker and cookie_key:
        # skip add cookie if service worker is not 200
        # it seems cookie headers from service workers are not applied, so don't update in cache
        if wb_url.mod == 'sw_':
            cookie_key = None

        cookie_rewriter = self.cookie_tracker.get_rewriter(
            urlrewriter, cookie_key)

    urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

    result = content_rw(record, urlrewriter, cookie_rewriter,
                        head_insert_func, cdx, environ)

    status_headers, gen, is_rw = result

    # history-page requests only need a title: prefer the one extracted
    # from the content, then the request header, then the page url itself
    if history_page:
        title = DefaultRewriter._extract_title(gen)
        if not title:
            title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

        if not title:
            title = history_page

        self._add_history_page(cdx, kwargs, title)
        return WbResponse.json_response({'title': title})

    if setcookie_headers:
        status_headers.headers.extend(setcookie_headers)

    # a status line without a space gets the placeholder text ' None' appended
    if ' ' not in status_headers.statusline:
        status_headers.statusline += ' None'

    if not is_ajax and self.enable_memento:
        self._add_memento_links(cdx['url'], full_prefix, memento_dt,
                                cdx['timestamp'], status_headers,
                                is_timegate, is_proxy,
                                cdx.get('source-coll'))

        set_content_loc = True

    if set_content_loc and not self.redirect_to_exact:
        status_headers.headers.append(
            ('Content-Location',
             urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                     url=cdx['url'])))

    if not is_proxy:
        self.add_csp_header(wb_url, status_headers)

    response = WbResponse(status_headers, gen)

    return response
def page_search(self, environ, coll):
    """Run a Solr page search for ``coll`` using the request's query params.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The collection name passed to the Solr query
    :return: JSON response wrapping the value returned by ``query_solr``
    :rtype: WbResponse
    """
    query_string = environ.get('QUERY_STRING')
    search_params = dict(parse_qsl(query_string))
    matches = self.solr_ingester.query_solr(coll, search_params)
    return WbResponse.json_response(matches)