def check_if_content(self, wb_url, environ):
    """Flag replay requests as content and bounce them to the content host.

    :param wb_url: wayback url string, parsed here with ``WbUrl``
    :param environ: WSGI environ; ``is_content`` is set to ``True`` on it
        when the url is a replay url
    """
    parsed = WbUrl(wb_url)
    if not parsed.is_replay():
        return

    environ['is_content'] = True

    # Redirect only when a dedicated content host is configured and this
    # request did not already arrive through it.
    if not self.content_host:
        return
    if self.is_content_request():
        return

    # Urls without a modifier and '$br:' urls are left where they are.
    modifier = parsed.mod
    if modifier == '' or modifier.startswith('$br:'):
        return

    self.redir_host(self.content_host)
def check_remote_archive(self, wb_url, mode, wb_url_obj=None):
    """Look up a remote archive for the url and build an extract target.

    :param wb_url: wayback url string, parsed only if ``wb_url_obj`` is falsy
    :param mode: accepted for interface compatibility; the returned mode is
        always derived from the matched archive id
    :param wb_url_obj: optional already-parsed ``WbUrl``
    :return: ``None`` when ``wam_loader`` finds no archive, otherwise a
        ``(mode, new_url)`` tuple where mode is ``'extract:' + <archive id>``
    """
    if not wb_url_obj:
        wb_url_obj = WbUrl(wb_url)

    match = self.wam_loader.find_archive_for_url(wb_url_obj.url)
    if not match:
        return None

    _pk, archive_url, archive_id = match

    # Carry the modifier of the incoming url over to the archive url.
    rewritten = WbUrl(archive_url).to_str(mod=wb_url_obj.mod)
    return 'extract:' + archive_id, rewritten
def redir_no_cookie(self):
    """Redirect a modifier-bearing request to the same url without a modifier.

    The request path is read from ``request.environ['PATH_INFO']`` and split
    into ``/<prefix>/<wb_url>``. When the parsed ``WbUrl`` has a non-empty
    modifier, ``redirect`` is called with the modifier stripped and the
    original query string (if any) re-appended. Nothing happens when the
    modifier is already empty.

    :raises ValueError: if the path has no ``/`` after the prefix segment
    """
    path = request.environ['PATH_INFO']
    prefix, wb_url = path[1:].split('/', 1)
    wb_url = WbUrl(wb_url)

    # A leftover debug print of the parsed url was removed here; it wrote to
    # stdout on every request.
    if wb_url.mod != '':
        url = '/' + prefix + '/' + wb_url.to_str(mod='')

        query = request.environ.get('QUERY_STRING')
        if query:
            url += '?' + query

        redirect(url)
def rewrite_record(self, headers, content, ts, url='http://example.com/',
                   prefix='http://localhost:8080/prefix/', warc_headers=None,
                   request_url=None, is_live=None, use_js_proxy=True,
                   environ=None):
    """Build a response record and run it through a content rewriter.

    :param headers: headers passed to ``_create_response_record``
    :param content: body passed to ``_create_response_record``
    :param ts: timestamp used for the wayback url and the cdx entry
    :param url: url stored in the record and the cdx entry
    :param prefix: rewrite prefix handed to ``UrlRewriter``
    :param warc_headers: extra headers passed to ``_create_response_record``
    :param request_url: url placed in the wayback url; falls back to ``url``
    :param is_live: stored verbatim as ``cdx['is_live']``
    :param use_js_proxy: pick ``js_proxy_content_rewriter`` when true,
        otherwise ``content_rewriter``
    :param environ: forwarded to the rewriter
    :return: whatever the selected rewriter returns
    """
    def insert_func(rule, cdx):
        # Head insert is stubbed out to an empty string.
        return ''

    record = self._create_response_record(url, headers, content, warc_headers)

    target = request_url or url
    url_rewriter = UrlRewriter(WbUrl(ts + '/' + target), prefix)

    cdx = CDXObject()
    cdx['url'] = url
    cdx['timestamp'] = ts
    cdx['urlkey'] = canonicalize(url)
    # NOTE: with the default request_url=None this comparison is true, so
    # the entry is marked fuzzy unless request_url is passed equal to url.
    if request_url != url:
        cdx['is_fuzzy'] = '1'
    cdx['is_live'] = is_live

    rewriter = (self.js_proxy_content_rewriter if use_js_proxy
                else self.content_rewriter)

    return rewriter(record, url_rewriter,
                    cookie_rewriter=None,
                    head_insert_func=insert_func,
                    cdx=cdx,
                    environ=environ)
def rewrite_record(self, headers, content, ts, url='http://example.com/',
                   prefix='http://localhost:8080/prefix/', warc_headers=None,
                   request_url=None, is_live=None):
    """Build a response record and pass it through ``self.content_rewriter``.

    :param headers: headers passed to ``_create_response_record``
    :param content: body passed to ``_create_response_record``
    :param ts: timestamp used for the wayback url and the cdx entry
    :param url: url stored in the record and the cdx entry
    :param prefix: rewrite prefix handed to ``UrlRewriter``
    :param warc_headers: extra headers passed to ``_create_response_record``
    :param request_url: url placed in the wayback url; falls back to ``url``
    :param is_live: stored verbatim as ``cdx['is_live']``
    :return: whatever ``self.content_rewriter`` returns
    """
    record = self._create_response_record(url, headers, content, warc_headers)

    target = request_url or url
    url_rewriter = UrlRewriter(WbUrl(ts + '/' + target), prefix)

    cdx = CDXObject()
    cdx['url'] = url
    cdx['timestamp'] = ts
    cdx['urlkey'] = canonicalize(url)
    # NOTE: with the default request_url=None this comparison is true, so
    # the entry is marked fuzzy unless request_url is passed equal to url.
    if request_url != url:
        cdx['is_fuzzy'] = '1'
    cdx['is_live'] = is_live

    # Third positional argument is None (no cookie rewriter is supplied).
    return self.content_rewriter(record, url_rewriter, None, cdx=cdx)
def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None,
             root_path=None, cookie_scope=None, rewrite_opts=None,
             pywb_static_prefix=None):
    """Set up the rewriter state for one wayback url.

    :param wburl: a ``WbUrl`` instance, or anything ``WbUrl`` can parse
    :param prefix: rewrite prefix
    :param full_prefix: absolute prefix; defaults to ``prefix``
    :param rel_prefix: relative prefix; defaults to ``prefix``
    :param root_path: defaults to ``'/'``
    :param cookie_scope: stored as given
    :param rewrite_opts: option dict; defaults to a new empty dict
    :param pywb_static_prefix: stored as ``_pywb_static_prefix``
    """
    if not isinstance(wburl, WbUrl):
        wburl = WbUrl(wburl)
    self.wburl = wburl

    self.prefix = prefix
    self.full_prefix = full_prefix if full_prefix else prefix
    self.rel_prefix = rel_prefix if rel_prefix else prefix
    self.root_path = root_path if root_path else '/'

    # The scheme is whatever precedes the first ':' of an absolute prefix.
    self.prefix_scheme = None
    if self.full_prefix and self.full_prefix.startswith(self.PROTOCOLS):
        self.prefix_scheme = self.full_prefix.split(':')[0]

    # Deliberately keeps the falsy prefix itself (e.g. '') when it is empty.
    self.prefix_abs = self.prefix and self.prefix.startswith(self.PROTOCOLS)

    self.cookie_scope = cookie_scope
    self.rewrite_opts = rewrite_opts if rewrite_opts else {}
    self._pywb_static_prefix = pywb_static_prefix

    # With punycode links enabled, percent-encoding is switched off on the
    # wayback url object itself.
    if self.rewrite_opts.get('punycode_links'):
        self.wburl._do_percent_encode = False
def rebase_rewriter(self, new_url):
    """Return a rewriter rebased onto ``new_url``.

    :param new_url: an already-rewritten url; a leading ``self.prefix`` or,
        failing that, a leading ``self.rel_prefix`` is stripped before the
        remainder is parsed as a ``WbUrl``
    :return: result of ``_create_rebased_rewriter`` for the parsed url
    """
    # First matching prefix wins; at most one is removed.
    for known_prefix in (self.prefix, self.rel_prefix):
        if new_url.startswith(known_prefix):
            new_url = new_url[len(known_prefix):]
            break

    return self._create_rebased_rewriter(WbUrl(new_url), self.prefix)
def rebase_rewriter(self, base_url):
    """Return a rewriter whose wayback url points at ``base_url``.

    :param base_url: new base url; when it does not start with one of
        ``self.PROTOCOLS`` it is first joined onto the current url
    :return: result of ``_create_rebased_rewriter`` for the new wayback url
    """
    is_absolute = base_url.startswith(self.PROTOCOLS)
    if is_absolute:
        target = base_url
    else:
        target = self.urljoin(self.wburl.url, base_url)

    # Re-serialise the current wayback url with only the url swapped, then
    # parse it again to get an independent WbUrl.
    rebased = WbUrl(self.wburl.to_str(url=target))
    return self._create_rebased_rewriter(rebased, self.prefix)
def __init__(self, orig_url):
    """Parse ``orig_url`` into a wayback url, leaving special schemes alone.

    Variant of the ``WbUrl`` initialiser that returns early for ``urn:``,
    ``screenshot:`` and ``thumbnail:`` urls so no ``http://`` scheme is
    forced onto them.

    :param orig_url: wayback url string (query or replay form)
    :raises Exception: if the string is neither a valid query nor a valid
        replay url
    """
    # Only the names actually used below are imported; the previously
    # imported but unused names were dropped.
    import six
    from six.moves.urllib.parse import quote
    from pywb.rewrite.wburl import WbUrl

    pywb.rewrite.wburl.BaseWbUrl.__init__(self)

    # On Python 2, unicode input is utf-8 encoded and then percent-quoted.
    if six.PY2 and isinstance(orig_url, six.text_type):
        orig_url = orig_url.encode('utf-8')
        orig_url = quote(orig_url)

    self._original_url = orig_url

    if not self._init_query(orig_url):
        if not self._init_replay(orig_url):
            raise Exception('Invalid WbUrl: ', orig_url)

    new_uri = WbUrl.to_uri(self.url)

    self._do_percent_encode = True

    self.url = new_uri

    # begin brozzler changes
    if self.url.startswith(('urn:', 'screenshot:', 'thumbnail:')):
        return
    # end brozzler changes

    # protocol agnostic url -> http://
    # no protocol -> http://
    # (a fallback for partially encoded schemes used to sit here but was
    # already commented out, so it has been removed)
    inx = -1
    m = self.SCHEME_RX.match(self.url)
    if m:
        inx = m.span(1)[0]

    if inx < 0:
        self.url = self.DEFAULT_SCHEME + self.url
    else:
        # Step two characters past the start of group 1 and make sure the
        # character there is a '/', inserting one if needed.
        inx += 2
        if inx < len(self.url) and self.url[inx] != '/':
            self.url = self.url[:inx] + '/' + self.url[inx:]
def unrewrite_referrer(self, environ, full_prefix):
    """Replace a rewritten ``HTTP_REFERER`` with the original url.

    :param environ: WSGI environ, updated in place on success
    :param full_prefix: rewrite prefix the referrer must start with
    :return: ``True`` if the referrer was replaced, ``False`` if it is
        missing, does not start with ``full_prefix``, or is nothing but
        the prefix
    """
    referrer = environ.get('HTTP_REFERER')
    if not referrer or not referrer.startswith(full_prefix):
        return False

    wb_url_str = referrer[len(full_prefix):]
    if not wb_url_str:
        return False

    environ['HTTP_REFERER'] = WbUrl(wb_url_str).url
    return True
def make_timemap(wbrequest, cdx_lines):
    """Yield the link-format lines of a TimeMap for the requested url.

    Order of the yielded strings: the ``self`` link with a ``from`` date
    (only when there is at least one capture), the ``original`` link, the
    ``timegate`` link, then either a terminating ``self`` link (no
    captures) or one memento link per capture.

    :param wbrequest: request object; ``wb_prefix``, ``wb_url`` and
        ``options['replay_mod']`` are read from it
    :param cdx_lines: iterable of cdx entries (any iterable is accepted,
        not only an iterator), each exposing a ``'timestamp'`` key
    """
    prefix = wbrequest.wb_prefix
    url = wbrequest.wb_url.url
    mod = wbrequest.options.get('replay_mod', '')

    # Normalise to a single iterator so the peek below and the loop further
    # down share one position, even when a list is passed in.
    cdx_lines = iter(cdx_lines)

    # get first memento as it'll be used for 'from' field
    first_cdx = next(cdx_lines, None)
    from_date = None
    if first_cdx is not None:
        from_date = timestamp_to_http_date(first_cdx['timestamp'])

    if first_cdx:
        # timemap link
        timemap = ('<{0}>; rel="self"; ' +
                   'type="application/link-format"; from="{1}",\n')
        yield timemap.format(prefix + wbrequest.wb_url.to_str(), from_date)

    # original link
    original = '<{0}>; rel="original",\n'
    yield original.format(url)

    # timegate link
    timegate = '<{0}>; rel="timegate",\n'
    timegate_url = WbUrl.to_wburl_str(url=url,
                                      mod=mod,
                                      type=WbUrl.LATEST_REPLAY)
    yield timegate.format(prefix + timegate_url)

    if not first_cdx:
        # terminating timemap link, no from
        timemap = ('<{0}>; rel="self"; type="application/link-format"')
        yield timemap.format(prefix + wbrequest.wb_url.to_str())
        return

    # first memento link
    yield make_timemap_memento_link(first_cdx, prefix,
                                    datetime=from_date, mod=mod)

    # Emit each entry one step late so the final one can be written with
    # end='' instead of the default terminator.
    prev_cdx = None
    for cdx in cdx_lines:
        if prev_cdx:
            yield make_timemap_memento_link(prev_cdx, prefix, mod=mod)
        prev_cdx = cdx

    # last memento link, if any
    if prev_cdx:
        yield make_timemap_memento_link(prev_cdx, prefix, end='', mod=mod)
def make_timemap(wbrequest, cdx_lines):
    """Yield the link-format lines of a TimeMap for the requested url.

    Order of the yielded strings: the ``self`` link with a ``from`` date
    (only when there is at least one capture), the ``original`` link, the
    ``timegate`` link, then either a terminating ``self`` link (no
    captures) or one memento link per capture.

    :param wbrequest: request object; ``wb_prefix``, ``wb_url`` and
        ``options['replay_mod']`` are read from it
    :param cdx_lines: iterable of cdx entries (any iterable is accepted,
        not only an iterator), each exposing a ``'timestamp'`` key
    """
    prefix = wbrequest.wb_prefix
    url = wbrequest.wb_url.url
    mod = wbrequest.options.get('replay_mod', '')

    # Normalise to a single iterator so the peek below and the loop further
    # down share one position, even when a list is passed in.
    cdx_lines = iter(cdx_lines)

    # get first memento as it'll be used for 'from' field
    first_cdx = next(cdx_lines, None)
    from_date = None
    if first_cdx is not None:
        from_date = timestamp_to_http_date(first_cdx['timestamp'])

    if first_cdx:
        # timemap link
        timemap = ('<{0}>; rel="self"; ' +
                   'type="application/link-format"; from="{1}",\n')
        yield timemap.format(prefix + wbrequest.wb_url.to_str(), from_date)

    # original link
    original = '<{0}>; rel="original",\n'
    yield original.format(url)

    # timegate link
    timegate = '<{0}>; rel="timegate",\n'
    timegate_url = WbUrl.to_wburl_str(url=url,
                                      mod=mod,
                                      type=WbUrl.LATEST_REPLAY)
    yield timegate.format(prefix + timegate_url)

    if not first_cdx:
        # terminating timemap link, no from
        timemap = ('<{0}>; rel="self"; type="application/link-format"')
        yield timemap.format(prefix + wbrequest.wb_url.to_str())
        return

    # first memento link
    yield make_timemap_memento_link(first_cdx, prefix,
                                    datetime=from_date, mod=mod)

    # Emit each entry one step late so the final one can be written with
    # end='' instead of the default terminator.
    prev_cdx = None
    for cdx in cdx_lines:
        if prev_cdx:
            yield make_timemap_memento_link(prev_cdx, prefix, mod=mod)
        prev_cdx = cdx

    # last memento link, if any
    if prev_cdx:
        yield make_timemap_memento_link(prev_cdx, prefix, end='', mod=mod)
def make_memento_link(cdx, prefix, datetime=None, rel='memento', end=',\n'):
    """Format one memento entry of a ``Link`` header.

    :param cdx: cdx entry; its ``'original'`` and ``'timestamp'`` keys are read
    :param prefix: string prepended to the wayback url
    :param datetime: pre-formatted date; when falsy it is derived from the
        cdx timestamp
    :param rel: value of the ``rel`` attribute
    :param end: text appended after the entry
    :return: the formatted link string
    """
    template = '<{0}>; rel="{1}"; datetime="{2}"' + end

    replay_path = WbUrl.to_wburl_str(url=cdx['original'],
                                     mod='',
                                     timestamp=cdx['timestamp'],
                                     type=WbUrl.REPLAY)
    link_url = prefix + replay_path

    http_date = datetime if datetime else timestamp_to_http_date(cdx['timestamp'])

    return template.format(link_url, rel, http_date)
def handle_routing(self, wb_url, user, coll, rec, type, is_embed=False, is_display=False, sources='', inv_sources='', redir_route=None):
    """Validate a content request and render it, or report an upstream error.

    :param wb_url: wayback url string from the route; the query is added
        via ``add_query`` before use
    :param user: user name; ``'_new'`` together with ``redir_route``
        short-circuits to ``do_create_new_and_redir``
    :param coll: collection name
    :param rec: recording name
    :param type: request mode string (compared against ``MODIFY_MODES``,
        ``'replay-coll'``, ``'replay'`` and ``'record'``)
    :param is_embed: forwarded in the render kwargs
    :param is_display: forwarded in the render kwargs
    :param sources: forwarded in the render kwargs
    :param inv_sources: forwarded; a value other than ``''``/``'*'`` in a
        modify mode selects a patch recording name
    :param redir_route: route used only for the ``'_new'`` user case
    :return: an ``HTTPResponse``, a redirect result, or the rendered
        ``content_error.html`` view on ``UpstreamException``
    :raises HTTPError: 404 for missing/closed recordings or missing
        collections, 402 when out of space or rate limited
    """
    wb_url = self.add_query(wb_url)
    if user == '_new' and redir_route:
        return self.do_create_new_and_redir(coll, rec, wb_url, redir_route)

    sesh = self.get_session()

    # NOTE(review): redir_set_session() presumably aborts the request with
    # a redirect -- confirm against its definition.
    if sesh.is_new() and self.is_content_request():
        self.redir_set_session()

    remote_ip = None
    frontend_cache_header = None
    patch_rec = ''

    # Modes that write to a recording: it must exist and be open, and the
    # user must be allowed to write, have space and not be rate limited.
    if type in self.MODIFY_MODES:
        if not self.manager.has_recording(user, coll, rec):
            self._redir_if_sanitized(self.sanitize_title(rec), rec, wb_url)

            # don't auto create recording for inner frame w/o accessing outer frame
            raise HTTPError(404, 'No Such Recording')

        elif not self.manager.is_recording_open(user, coll, rec):
            # force creation of new recording as this one is closed
            raise HTTPError(404, 'Recording not open')

        self.manager.assert_can_write(user, coll)

        if self.manager.is_out_of_space(user):
            raise HTTPError(402, 'Out of Space')

        remote_ip = self._get_remote_ip()

        if self.manager.is_rate_limited(user, remote_ip):
            raise HTTPError(402, 'Rate Limit')

        if inv_sources and inv_sources != '*':
            patch_rec = self.patch_of_name(rec, True)

    if type == 'replay-coll':
        res = self.manager.has_collection_is_public(user, coll)
        if not res:
            self._redir_if_sanitized(self.sanitize_title(coll), coll, wb_url)
            raise HTTPError(404, 'No Such Collection')

        # Any truthy result other than 'public' is served with a private
        # cache header.
        if res != 'public':
            frontend_cache_header = ('Cache-Control', 'private')

    elif type == 'replay':
        if not self.manager.has_recording(user, coll, rec):
            raise HTTPError(404, 'No Such Recording')

    request.environ['SCRIPT_NAME'] = quote(request.environ['SCRIPT_NAME'], safe='/:')

    wb_url = self._context_massage(wb_url)

    wb_url_obj = WbUrl(wb_url)

    # Top frame: the url carries the frame modifier or a '$br:' modifier.
    is_top_frame = (wb_url_obj.mod == self.frame_mod or wb_url_obj.mod.startswith('$br:'))

    if type == 'record' and is_top_frame:
        # A match in a remote archive turns the request into a redirect
        # to the mode returned by check_remote_archive.
        result = self.check_remote_archive(wb_url, type, wb_url_obj)
        if result:
            mode, wb_url = result
            new_url = '/{user}/{coll}/{rec}/{mode}/{url}'.format(user=user,
                                                                 coll=coll,
                                                                 rec=rec,
                                                                 mode=mode,
                                                                 url=wb_url)
            return self.redirect(new_url)

    elif type == 'replay-coll' and not is_top_frame:
        self.manager.sync_coll_index(user, coll, exists=False, do_async=False)

    kwargs = dict(user=user,
                  coll_orig=coll,
                  id=sesh.get_id(),
                  rec_orig=rec,
                  coll=quote(coll),
                  rec=quote(rec, safe='/*'),
                  type=type,
                  sources=sources,
                  inv_sources=inv_sources,
                  patch_rec=patch_rec,
                  ip=remote_ip,
                  is_embed=is_embed,
                  is_display=is_display,
                  use_js_obj_proxy=True)

    try:
        self.check_if_content(wb_url_obj, request.environ, is_top_frame)

        resp = self.render_content(wb_url, kwargs, request.environ)

        if frontend_cache_header:
            resp.status_headers.headers.append(frontend_cache_header)

        # Re-wrap the rendered response as an HTTPResponse.
        resp = HTTPResponse(body=resp.body,
                            status=resp.status_headers.statusline,
                            headers=resp.status_headers.headers)

        return resp

    except UpstreamException as ue:
        # Render the error page with the upstream status code.
        @self.jinja2_view('content_error.html')
        def handle_error(status_code, type, url, err_info):
            response.status = status_code
            return {'url': url,
                    'status': status_code,
                    'error': err_info.get('error'),
                    'user': self.get_view_user(user),
                    'coll': coll,
                    'rec': rec,
                    'type': type,
                    'app_host': self.app_host,
                    }

        return handle_error(ue.status_code, type, ue.url, ue.msg)
def handle_routing(self, wb_url, user, coll, rec, type, is_embed=False, is_display=False, sources='', inv_sources='', redir_route=None):
    """Validate a content request, create recordings on demand, and render it.

    :param wb_url: wayback url string from the route; the query is added
        via ``add_query`` before use
    :param user: user name; ``'_new'`` together with ``redir_route``
        short-circuits to ``do_create_new_and_redir``
    :param coll: collection name
    :param rec: recording name; replaced by the result of
        ``_create_new_rec`` when it is missing in a modify mode
    :param type: request mode string (compared against ``'replay'``,
        ``'replay-coll'`` and ``MODIFY_MODES``)
    :param is_embed: forwarded in the render kwargs
    :param is_display: forwarded in the render kwargs
    :param sources: forwarded in the render kwargs
    :param inv_sources: forwarded; a value other than ``''``/``'*'``
        creates a ``'Patch of <rec>'`` recording
    :param redir_route: route used only for the ``'_new'`` user case
    :return: an ``HTTPResponse``, or the rendered ``content_error.html``
        view on ``UpstreamException``
    :raises HTTPError: 404 for missing collections/recordings, 402 when
        out of space or rate limited
    """
    wb_url = self.add_query(wb_url)
    if user == '_new' and redir_route:
        return self.do_create_new_and_redir(coll, rec, wb_url, redir_route)

    not_found = False

    sesh = self.get_session()

    # NOTE(review): redir_set_session() presumably aborts the request with
    # a redirect -- confirm against its definition.
    if sesh.is_new() and self.is_content_request():
        self.redir_set_session()

    remote_ip = None

    if type == 'replay' or type in self.MODIFY_MODES:
        # Only remember that the recording is missing here; it is handled
        # after the collection check below.
        if not self.manager.has_recording(user, coll, rec):
            not_found = True

        if type != 'replay':
            self.manager.assert_can_write(user, coll)

            if self.manager.is_out_of_space(user):
                raise HTTPError(402, 'Out of Space')

            remote_ip = self._get_remote_ip()

            if self.manager.is_rate_limited(user, remote_ip):
                raise HTTPError(402, 'Rate Limit')

    # The collection lookup is skipped for an anonymous user's 'temp'
    # collection.
    if ((not_found or type == 'replay-coll') and
        (not (self.manager.is_anon(user) and coll == 'temp')) and
        (not self.manager.has_collection(user, coll))):

        self._redir_if_sanitized(self.sanitize_title(coll), coll, wb_url)

        raise HTTPError(404, 'No Such Collection')

    if not_found:
        title = rec

        # Modify modes create the missing recording; replay reports 404.
        if type in self.MODIFY_MODES:
            rec = self._create_new_rec(user, coll, title, type, no_dupe=True)

        self._redir_if_sanitized(rec, title, wb_url)

        if type == 'replay':
            raise HTTPError(404, 'No Such Recording')

    patch_rec = ''

    if inv_sources and inv_sources != '*':
        patch_rec = self._create_new_rec(user, coll, 'Patch of ' + rec,
                                         mode='patch',
                                         no_dupe=True)

    request.environ['SCRIPT_NAME'] = quote(request.environ['SCRIPT_NAME'], safe='/:')

    wb_url = self._context_massage(wb_url)

    wb_url_obj = WbUrl(wb_url)

    kwargs = dict(user=user,
                  coll_orig=coll,
                  id=sesh.get_id(),
                  rec_orig=rec,
                  coll=quote(coll),
                  rec=quote(rec, safe='/*'),
                  type=type,
                  sources=sources,
                  inv_sources=inv_sources,
                  patch_rec=patch_rec,
                  ip=remote_ip,
                  is_embed=is_embed,
                  is_display=is_display)

    try:
        self.check_if_content(wb_url_obj, request.environ)

        resp = self.render_content(wb_url, kwargs, request.environ)

        self.add_csp_header(wb_url_obj, resp.status_headers)

        # Re-wrap the rendered response as an HTTPResponse.
        resp = HTTPResponse(body=resp.body,
                            status=resp.status_headers.statusline,
                            headers=resp.status_headers.headers)

        return resp

    except UpstreamException as ue:
        # Render the error page with the upstream status code.
        @self.jinja2_view('content_error.html')
        def handle_error(status_code, type, url, err_info):
            response.status = status_code
            return {
                    'url': url,
                    'status': status_code,
                    'error': err_info.get('error'),
                    'user': self.get_view_user(user),
                    'coll': coll,
                    'rec': rec,
                    'type': type,
                    'app_host': self.app_host,
                   }

        return handle_error(ue.status_code, type, ue.url, ue.msg)
def __call__(self, env, the_router):
    """Resolve a request relative to the archived page named by its referrer.

    :param env: WSGI environ; ``HTTP_REFERER``, ``HTTP_HOST``,
        ``SCRIPT_NAME`` and ``REL_REQUEST_URI`` are read
    :param the_router: router exposing ``routes`` and ``parse_request``
    :return: ``None`` when the referrer is missing, is on another host, is
        outside the app path, or matches no route with a url rewriter;
        the route handler's result for a POST referrer request; otherwise
        a 302 redirect response to the rewritten url
    """
    referrer = env.get('HTTP_REFERER')
    routes = the_router.routes

    # ensure there is a referrer
    if referrer is None:
        return None

    # get referrer path name
    ref_parts = urlsplit(referrer)

    # require that referrer starts with current Host, if any
    current_host = env.get('HTTP_HOST')
    if current_host and current_host != ref_parts.netloc:
        return None

    path = ref_parts.path

    script_name = env.get('SCRIPT_NAME', '')
    if script_name:
        # must start with current app name, if not root
        if not path.startswith(script_name):
            return None
        path = path[len(script_name):]

    # Take the first route that handles the referrer path.
    ref_route = None
    ref_request = None
    for candidate in routes:
        matcher, coll = candidate.is_handling(path)
        if not matcher:
            continue
        ref_request = the_router.parse_request(candidate, env,
                                               matcher, coll, path)
        ref_route = candidate
        break

    # must have matched one of the routes with a urlrewriter
    if not ref_request or not ref_request.urlrewriter:
        return None

    rewriter = ref_request.urlrewriter
    request_uri = env['REL_REQUEST_URI']

    # check if timestamp is already part of the path
    ts_segment = '/' + rewriter.wburl.timestamp + '/'
    if request_uri.startswith(ts_segment):
        # remove timestamp but leave / to make host relative url
        # 2013/path.html -> /path.html
        request_uri = request_uri[len(ts_segment) - 1:]

    rewritten_url = rewriter.rewrite(request_uri)

    # if post, can't redirect as that would lose the post data
    # (can't use 307 because FF will show confirmation warning)
    if ref_request.method == 'POST':
        posted = WbUrl(rewritten_url[len(rewriter.prefix):])
        ref_request.wb_url.url = posted.url
        return ref_route.handler(ref_request)

    redirect_parts = (ref_parts.scheme, ref_parts.netloc, rewritten_url, '', '')
    return WbResponse.redir_response(urlunsplit(redirect_parts),
                                     status='302 Temp Redirect')
def render_content(self, wbrequest):
    """Fetch the live url of ``wbrequest`` and return the rewritten response.

    :param wbrequest: request object; ``wb_url``, ``env`` and
        ``urlrewriter`` are read, and ``wb_url.url`` / ``env`` may be
        modified for range requests
    :return: the video info result for the ``vi_`` modifier, otherwise the
        response built by ``_make_response``
    """
    if wbrequest.wb_url.mod == 'vi_':
        return self._get_video_info(wbrequest)

    head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
    req_headers = self._live_request_headers(wbrequest)

    ref_wburl_str = wbrequest.extract_referrer_wburl_str()
    if ref_wburl_str:
        wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url

    ignore_proxies = False
    use_206 = False
    url = None
    rangeres = None

    readd_range = False
    cache_key = None

    if self.proxies:
        rangeres = wbrequest.extract_range()

        if rangeres:
            url, start, end, use_206 = rangeres

            # if bytes=0- Range request,
            # simply remove the range and still proxy
            if start == 0 and not end and use_206:
                wbrequest.wb_url.url = url
                del wbrequest.env['HTTP_RANGE']
                readd_range = True
            else:
                # disables proxy
                ignore_proxies = True

                # sets cache_key only if not already cached
                cache_key = self._get_cache_key('r:', url)

    result = self.rewriter.fetch_request(wbrequest.wb_url.url,
                                         wbrequest.urlrewriter,
                                         head_insert_func=head_insert_func,
                                         req_headers=req_headers,
                                         env=wbrequest.env,
                                         ignore_proxies=ignore_proxies,
                                         verify=self.verify)

    wbresponse = self._make_response(wbrequest, *result)

    if readd_range:
        # The range header was stripped above; restore a full-length range
        # on the response when Content-Length is a usable integer.
        content_length = (
            wbresponse.status_headers.get_header('Content-Length'))
        try:
            content_length = int(content_length)
            wbresponse.status_headers.add_range(0, content_length,
                                                content_length)
        except (ValueError, TypeError):
            pass

    if cache_key:
        self._add_proxy_ping(cache_key, url, wbrequest, wbresponse)

    if rangeres:
        referrer = wbrequest.env.get('REL_REFERER')

        # also ping video info
        if referrer:
            # Best effort: a failure here must not break the main response,
            # but KeyboardInterrupt/SystemExit are no longer swallowed.
            try:
                self._get_video_info(wbrequest,
                                     info_url=referrer,
                                     video_url=url)
            except Exception:
                print('Error getting video info')

    return wbresponse
def handle_magic_page(self, env):
    """Dispatch a request on a magic proxy host name to its handler.

    :param env: WSGI environ; ``REL_REQUEST_URI`` and ``pywb.proxy_host``
        are read, plus ``pywb.proxy_scheme`` on the set-host branch
    :return: the response of the matching branch, or ``None`` when the
        host name matches no branch or a ``query.`` request is a query url
    """
    parts = urlparse.urlsplit(env['REL_REQUEST_URI'])
    server_name = env['pywb.proxy_host']

    # Path without its leading '/', with the query string re-attached.
    path_url = parts.path[1:]
    if parts.query:
        path_url += '?' + parts.query

    if server_name.startswith('auto'):
        coll, _ts, sesh_id = self.get_coll(env)
        if coll:
            return self.make_sethost_cookie_response(sesh_id, path_url, env)
        return self.make_magic_response('select', path_url, env)

    if server_name.startswith('query.'):
        wb_url = WbUrl(path_url)

        # only dealing with specific timestamp setting
        if wb_url.is_query():
            return None

        coll, _ts, sesh_id = self.get_coll(env)
        if not coll:
            return self.make_magic_response('select', path_url, env)

        self.set_ts(sesh_id, wb_url.timestamp)
        return self.make_redir_response(wb_url.url)

    if server_name.endswith(self.set_prefix):
        previous_id = extract_client_cookie(env, self.cookie_name)
        sesh_id = self.create_renew_sesh_id(previous_id)

        # Cookie headers are only sent when the session id changed.
        headers = None
        if sesh_id != previous_id:
            headers = self.make_cookie_headers(sesh_id, self.magic_name)

        # The collection is the host name minus the set prefix suffix.
        coll = server_name[:-len(self.set_prefix)]

        # set sesh value
        self.set_coll(sesh_id, coll)

        return self.make_sethost_cookie_response(sesh_id, path_url, env,
                                                 headers=headers)

    if self.sethost_prefix in server_name:
        # Host name layout: <sesh_id><sethost_prefix><domain>
        split_at = server_name.find(self.sethost_prefix)
        sesh_id = server_name[:split_at]
        domain = server_name[split_at + len(self.sethost_prefix):]

        headers = self.make_cookie_headers(sesh_id, domain)

        full_url = env['pywb.proxy_scheme'] + '://' + domain
        full_url += '/' + path_url
        return self.make_redir_response(full_url, headers=headers)

    if 'select.' in server_name:
        coll, _ts, sesh_id = self.get_coll(env)

        route_temp = '-set.' + self.magic_name + '/' + path_url

        return self.proxy_select_view.render_response(routes=self.routes,
                                                      route_temp=route_temp,
                                                      coll=coll,
                                                      url=path_url)
def handle_routing(self, wb_url, user, coll_name, rec_name, type, is_embed=False, is_display=False, sources='', inv_sources='', redir_route=None):
    """Validate a content request against user/collection/recording and render it.

    :param wb_url: wayback url string from the route; expanded with
        ``_full_url`` before use
    :param user: user name; ``'_new'`` together with ``redir_route``
        short-circuits to ``do_create_new_and_redir``
    :param coll_name: collection name as given in the url
    :param rec_name: recording name as given in the url
    :param type: request mode string (compared against ``MODIFY_MODES``,
        ``'replay-coll'``, ``'replay'`` and ``'record'``)
    :param is_embed: forwarded in the render kwargs
    :param is_display: forwarded in the render kwargs
    :param sources: forwarded in the render kwargs
    :param inv_sources: forwarded; a value other than ``''``/``'*'`` in a
        modify mode selects the recording's patch recording
    :param redir_route: route used only for the ``'_new'`` user case
    :return: an ``HTTPResponse``, a redirect result, or the error view /
        error redirect on ``UpstreamException``
    """
    wb_url = self._full_url(wb_url)

    if user == '_new' and redir_route:
        return self.do_create_new_and_redir(coll_name, rec_name, wb_url, redir_route)

    sesh = self.get_session()

    remote_ip = None
    frontend_cache_header = None
    patch_recording = None

    the_user, collection, recording = self.user_manager.get_user_coll_rec(
        user, coll_name, rec_name)

    # NOTE(review): _raise_error() presumably never returns -- confirm; the
    # code after each call relies on that.
    if not the_user:
        msg = 'not_found' if user == 'api' else 'no_such_user'
        self._raise_error(404, msg)

    coll = collection.my_id if collection else None
    rec = recording.my_id if recording else None

    # Modes that write to a recording: it must exist and be open, and the
    # user must be allowed to write, have space and pass the rate limit.
    if type in self.MODIFY_MODES:
        if sesh.is_new() and self.is_content_request():
            self.redir_set_session()

        if not recording:
            self._redir_if_sanitized(self.sanitize_title(rec_name), rec_name, wb_url)

            # don't auto create recording for inner frame w/o accessing outer frame
            self._raise_error(404, 'no_such_recording')

        elif not recording.is_open():
            # force creation of new recording as this one is closed
            self._raise_error(400, 'recording_not_open')

        collection.access.assert_can_write_coll(collection)

        if the_user.is_out_of_space():
            self._raise_error(402, 'out_of_space')

        remote_ip = self._get_remote_ip()

        # check_rate_limit returns the ip value used from here on.
        remote_ip = self.check_rate_limit(the_user, remote_ip)

        if inv_sources and inv_sources != '*':
            #patch_rec_name = self.patch_of_name(rec, True)
            patch_recording = recording.get_patch_recording()
            #patch_recording = collection.get_recording_by_name(patch_rec_name)

    if type in ('replay-coll', 'replay'):
        if not collection:
            self._redir_if_sanitized(self.sanitize_title(coll_name), coll_name, wb_url)

            # A brand new session on a content request first gets a
            # session; otherwise the collection is reported missing.
            if sesh.is_new() and self.is_content_request():
                self.redir_set_session()
            else:
                self._raise_error(404, 'no_such_collection')

        access = self.access.check_read_access_public(collection)

        if not access:
            if sesh.is_new() and self.is_content_request():
                self.redir_set_session()
            else:
                self._raise_error(404, 'no_such_collection')

        # Any access level other than 'public' is served with a private
        # cache header.
        if access != 'public':
            frontend_cache_header = ('Cache-Control', 'private')

        if type == 'replay':
            if not recording:
                self._raise_error(404, 'no_such_recording')

    request.environ['SCRIPT_NAME'] = quote(request.environ['SCRIPT_NAME'], safe='/:')

    wb_url = self._context_massage(wb_url)

    wb_url_obj = WbUrl(wb_url)

    # Top frame: the url carries the frame modifier or a '$br:' modifier.
    is_top_frame = (wb_url_obj.mod == self.frame_mod or wb_url_obj.mod.startswith('$br:'))

    if type == 'record' and is_top_frame:
        # A match in a remote archive turns the request into a redirect
        # to the mode returned by check_remote_archive.
        result = self.check_remote_archive(wb_url, type, wb_url_obj)
        if result:
            mode, wb_url = result
            new_url = '/{user}/{coll}/{rec}/{mode}/{url}'.format(
                user=user,
                coll=coll_name,
                rec=rec_name,
                mode=mode,
                url=wb_url)
            return self.redirect(new_url)

    elif type == 'replay-coll' and not is_top_frame:
        collection.sync_coll_index(exists=False, do_async=False)

    kwargs = dict(
        user=user,
        id=sesh.get_id(),
        coll=coll,
        rec=rec,
        coll_name=quote(coll_name),
        rec_name=quote(rec_name, safe='/*'),
        the_user=the_user,
        collection=collection,
        recording=recording,
        patch_recording=patch_recording,
        type=type,
        sources=sources,
        inv_sources=inv_sources,
        patch_rec=patch_recording.my_id if patch_recording else None,
        ip=remote_ip,
        is_embed=is_embed,
        is_display=is_display)

    # top-frame replay but through a proxy, redirect to original
    if is_top_frame and 'wsgiprox.proxy_host' in request.environ:
        kwargs['url'] = wb_url_obj.url
        kwargs['request_ts'] = wb_url_obj.timestamp
        self.browser_mgr.update_local_browser(kwargs)
        return redirect(wb_url_obj.url)

    try:
        self.check_if_content(wb_url_obj, request.environ, is_top_frame)

        request.environ['pywb.static_prefix'] = self.BUNDLE_PREFIX

        # BEGIN PERMA CUSTOMIZATIONS
        try:
            resp = self.render_content(wb_url, kwargs, request.environ)
        except UpstreamException as ue:
            if ue.status_code == 404:
                # Retry all 404s after 1s, in a broad effort to allay
                # https://github.com/harvard-lil/perma/issues/2633
                # until I pinpoint the real problem.
                # (the actual delay is self.sleep_on_404; a failure of the
                # retry propagates to the outer handler below)
                time.sleep(self.sleep_on_404)
                resp = self.render_content(wb_url, kwargs, request.environ)
            else:
                raise
        # END PERMA CUSTOMIZATIONS

        if frontend_cache_header:
            resp.status_headers.headers.append(frontend_cache_header)

        # BEGIN PERMA CUSTOMIZATIONS
        # make sure all headers can be encoded in ascii
        # workaround for https://github.com/harvard-lil/perma/issues/2603
        # technique adapted from https://github.com/webrecorder/warcio/blob/master/warcio/statusandheaders.py#L183
        headers = resp.status_headers.headers
        # Iterate over a copy because entries are replaced in place.
        for i, header in enumerate(headers[:]):
            try:
                header[1].encode('ascii')
            except UnicodeEncodeError:
                escaped = ENCODE_HEADER_RX.sub(do_encode, header[1])
                # Fall back to percent-quoting the whole value when the
                # regex substitution changed nothing.
                if escaped == header[1]:
                    escaped = quote(escaped)
                headers[i] = (header[0], escaped)

        resp = HTTPResponse(body=resp.body,
                            status=resp.status_headers.statusline,
                            headers=headers)
        # END PERMA CUSTOMIZATIONS

        return resp

    except UpstreamException as ue:
        err_context = {
            'url': ue.url,
            'status': ue.status_code,
            'error': ue.msg.get('error'),
            'timestamp': wb_url_obj.timestamp if wb_url_obj else '',
            'user': user,
            'coll': coll_name,
            'rec': rec_name,
            'type': type,
            'app_host': self.app_host,
        }

        # Render the error page with the upstream status code.
        @self.jinja2_view('content_error.html')
        def handle_error(error):
            response.status = ue.status_code
            return error

        # With an error redirect configured, the context is sent there as
        # a query string instead of being rendered locally.
        if self.content_error_redirect:
            return redirect(self.content_error_redirect + '?' + urlencode(err_context), code=307)
        else:
            return handle_error(err_context)
def process_record(self, record, flow):
    """Rewrite a proxied record and decide how its body is delivered.

    :param record: record exposing ``http_headers`` and ``raw_stream``
    :param flow: proxy flow; ``response.headers``, ``request`` and
        ``extra_data`` are read
    :return: ``(headers, stream)``. Without a content rewriter the record
        is passed through untouched; otherwise the rewritten status
        headers and an ``IterIdent`` over the rewritten body
    """
    headers = flow.response.headers
    url = flow.request.req_url
    scheme = flow.request.req_scheme

    if not self.content_rewriter:
        return record.http_headers, StreamIO(record.raw_stream)

    cookie_rewriter = None

    template_params = flow.extra_data

    environ = {'pywb_proxy_magic': self.proxy_magic,
               'webrec.template_params': template_params}

    wb_url = WbUrl(url)
    wb_prefix = ''
    host_prefix = scheme + '://' + self.proxy_magic
    urlrewriter = SchemeOnlyUrlRewriter(wb_url, '')

    if flow.request.headers.get('X-Requested-With', '').lower() == 'xmlhttprequest':
        urlrewriter.rewrite_opts['is_ajax'] = True

    head_insert_func = self.head_insert_view.create_insert_func(
        wb_url, wb_prefix, host_prefix, url, environ, False)

    urlkey = canonicalize(wb_url.url)

    cdx = CDXObject()
    cdx['urlkey'] = urlkey
    cdx['timestamp'] = http_date_to_timestamp(
        headers.get('Memento-Datetime'))
    cdx['url'] = wb_url.url
    if headers.get('Webagg-Source-Coll') == 'live':
        cdx['is_live'] = 'true'

    result = self.content_rewriter.rewrite_content(
        urlrewriter, record.http_headers, record.raw_stream,
        head_insert_func, urlkey, cdx, cookie_rewriter, environ)

    status_headers, gen, _is_rw = result

    status_headers.remove_header('Content-Security-Policy')

    # check for content-length
    # Only a missing or non-numeric header is tolerated here; any other
    # error is no longer silently swallowed.
    try:
        content_length = int(status_headers.get_header('content-length'))
    except (TypeError, ValueError):
        content_length = 0

    if content_length > 0:
        return status_headers, IterIdent(gen)

    # need to either chunk or buffer to get content-length
    if flow.request.http_version == 'HTTP/1.1':
        status_headers.remove_header('content-length')
        status_headers.headers.append(('Transfer-Encoding', 'chunked'))
        # NOTE: the body iterator itself is not chunk-encoded in this block.
    else:
        gen = buffer_iter(status_headers, gen)

    return status_headers, IterIdent(gen)
def render_content(self, wb_url, kwargs, environ):
    """Fetch the record for ``wb_url`` upstream and return it rewritten.

    :param wb_url: wayback url string, parsed here with ``WbUrl``
    :param kwargs: routing parameters; ``kwargs['type']`` of ``'record'``
        or ``'patch'`` enables range handling
    :param environ: WSGI environ; ``HTTP_RANGE`` may be removed from it
    :return: a text ``WbResponse`` for custom responses, otherwise a
        ``WbResponse`` wrapping the rewritten headers and body
    :raises UpstreamException: when the upstream status code is >= 400
    """
    wb_url = WbUrl(wb_url)

    host_prefix = self.get_host_prefix(environ)
    rel_prefix = self.get_rel_prefix(environ)
    full_prefix = host_prefix + rel_prefix

    resp = self.handle_custom_response(environ, wb_url,
                                       full_prefix, host_prefix, kwargs)
    if resp is not None:
        content_type = 'text/html'

        # if not replay outer frame, specify utf-8 charset
        if not self.is_framed_replay(wb_url):
            content_type += '; charset=utf-8'

        return WbResponse.text_response(resp, content_type=content_type)

    urlrewriter = UrlRewriter(wb_url,
                              prefix=full_prefix,
                              full_prefix=full_prefix,
                              rel_prefix=rel_prefix)

    self.unrewrite_referrer(environ)

    urlkey = canonicalize(wb_url.url)

    inputreq = RewriteInputRequest(environ, urlkey, wb_url.url,
                                   self.content_rewriter)

    inputreq.include_post_query(wb_url.url)

    mod_url = None
    use_206 = False
    rangeres = None

    readd_range = False
    async_record_url = None

    if kwargs.get('type') in ('record', 'patch'):
        rangeres = inputreq.extract_range()

    if rangeres:
        mod_url, start, end, use_206 = rangeres

        # if bytes=0- Range request,
        # simply remove the range and still proxy
        if start == 0 and not end and use_206:
            wb_url.url = mod_url
            inputreq.url = mod_url

            del environ['HTTP_RANGE']
            readd_range = True
        else:
            # Any other range: the full url is fetched separately further
            # down (see the gevent.spawn call).
            async_record_url = mod_url

    skip = async_record_url is not None

    setcookie_headers = None
    cookie_key = None
    if self.cookie_tracker:
        cookie_key = self.get_cookie_key(kwargs)
        res = self.cookie_tracker.get_cookie_headers(wb_url.url, cookie_key)
        inputreq.extra_cookie, setcookie_headers = res

    r = self._do_req(inputreq, wb_url, kwargs, skip)

    if r.status_code >= 400:
        # Reading the error body is best effort, but the upstream stream
        # is now always closed, even when read() fails.
        error = None
        try:
            error = r.raw.read()
        except Exception:
            pass
        finally:
            try:
                r.raw.close()
            except Exception:
                pass

        if error:
            error = error.decode('utf-8')
        else:
            error = ''

        details = dict(args=kwargs, error=error)
        raise UpstreamException(r.status_code, url=wb_url.url, details=details)

    if async_record_url:
        environ.pop('HTTP_RANGE', '')
        gevent.spawn(self._do_async_req,
                     inputreq,
                     async_record_url,
                     wb_url,
                     kwargs,
                     False)

    record = self.loader.parse_record_stream(r.raw)

    cdx = CDXObject()
    cdx['urlkey'] = urlkey
    cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
    cdx['url'] = wb_url.url

    self._add_custom_params(cdx, r.headers, kwargs)

    if readd_range:
        # The range header was stripped above; restore a full-length range
        # on the record when Content-Length is a usable integer.
        content_length = record.status_headers.get_header('Content-Length')
        try:
            content_length = int(content_length)
            record.status_headers.add_range(0, content_length,
                                            content_length)
        except (ValueError, TypeError):
            pass

    if self.is_ajax(environ):
        head_insert_func = None
        urlrewriter.rewrite_opts['is_ajax'] = True
    else:
        top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
        head_insert_func = self.head_insert_view.create_insert_func(
            wb_url,
            full_prefix,
            host_prefix,
            top_url,
            environ,
            self.framed_replay)

    cookie_rewriter = None
    if self.cookie_tracker:
        cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                           cookie_key)

    result = self.content_rewriter.rewrite_content(urlrewriter,
                                                   record.status_headers,
                                                   record.stream,
                                                   head_insert_func,
                                                   urlkey,
                                                   cdx,
                                                   cookie_rewriter,
                                                   environ)

    status_headers, gen, _is_rw = result

    if setcookie_headers:
        status_headers.headers.extend(setcookie_headers)

    return WbResponse(status_headers, gen)
def render_content(self, wb_url, kwargs, environ):
    """Fetch the record for ``wb_url`` upstream and return it rewritten.

    :param wb_url: wayback url string; ``#`` is escaped to ``%23`` before
        it is parsed with ``WbUrl``
    :param kwargs: routing parameters forwarded to the helper methods
    :param environ: WSGI environ; several ``pywb.*`` / ``wsgi.*`` keys are
        written and ``HTTP_X_WOMBAT_HISTORY_PAGE`` is popped
    :return: a ``WbResponse`` (content, redirect or JSON), or the result
        of ``format_response`` / ``send_redirect``
    :raises UpstreamException: when the upstream status code is >= 400
    """
    wb_url = wb_url.replace('#', '%23')
    wb_url = WbUrl(wb_url)

    # A forwarded protocol header, or the configured force_scheme, sets
    # the url scheme in the environ.
    proto = environ.get('HTTP_X_FORWARDED_PROTO', self.force_scheme)

    if proto:
        environ['wsgi.url_scheme'] = proto

    # A history-page request replaces the url and is treated as ajax.
    history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
    if history_page:
        wb_url.url = history_page
        is_ajax = True
    else:
        is_ajax = self.is_ajax(environ)

    is_timegate = self._check_accept_dt(wb_url, environ)

    host_prefix = self.get_host_prefix(environ)
    rel_prefix = self.get_rel_prefix(environ)
    full_prefix = host_prefix + rel_prefix
    environ['pywb.host_prefix'] = host_prefix
    pywb_static_prefix = host_prefix + environ.get(
        'pywb.app_prefix', '') + environ.get('pywb.static_prefix', '/static/')
    is_proxy = ('wsgiprox.proxy_host' in environ)

    response = self.handle_custom_response(environ, wb_url,
                                           full_prefix, host_prefix,
                                           kwargs)

    if response:
        return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)

    # Proxy mode uses an identity rewriter and never framed replay.
    if is_proxy:
        environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
        urlrewriter = IdentityUrlRewriter(wb_url, '')
        framed_replay = False

    else:
        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix,
                                  pywb_static_prefix=pywb_static_prefix)

        framed_replay = self.framed_replay

    # A url with an empty path is redirected to the same url with '/'.
    url_parts = urlsplit(wb_url.url)
    if not url_parts.path:
        return self.send_redirect('/', url_parts, urlrewriter)

    self.unrewrite_referrer(environ, full_prefix)

    urlkey = canonicalize(wb_url.url)

    if self.use_js_obj_proxy:
        content_rw = self.js_proxy_rw
    else:
        content_rw = self.default_rw

    inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

    inputreq.include_method_query(wb_url.url)

    range_start, range_end, skip_record = self._check_range(
        inputreq, wb_url)

    setcookie_headers = None
    cookie_key = None
    if self.cookie_tracker:
        cookie_key = self.get_cookie_key(kwargs)
        if cookie_key:
            res = self.cookie_tracker.get_cookie_headers(
                wb_url.url, urlrewriter, cookie_key,
                environ.get('HTTP_COOKIE', ''))
            inputreq.extra_cookie, setcookie_headers = res

    r = self._do_req(inputreq, wb_url, kwargs, skip_record)

    if r.status_code >= 400:
        # Best-effort read of the error body; the stream is always closed.
        error = None
        try:
            error = r.raw.read()
        except Exception:
            pass
        finally:
            no_except_close(r.raw)

        if error:
            error = error.decode('utf-8')
        else:
            error = ''

        details = dict(args=kwargs, error=error)
        raise UpstreamException(r.status_code, url=wb_url.url, details=details)

    # NOTE(review): assumes the 'Warcserver-Cdx' header is always present
    # on successful responses; .get() returning None would fail here.
    cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

    cdx_url_parts = urlsplit(cdx['url'])

    if cdx_url_parts.path.endswith(
            '/') and not url_parts.path.endswith('/'):
        # add trailing slash
        new_path = url_parts.path + '/'

        # The upstream stream is closed before redirecting.
        no_except_close(r.raw)

        return self.send_redirect(new_path, url_parts, urlrewriter)

    stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
    record = self.loader.parse_record_stream(stream,
                                             ensure_http_headers=True)

    memento_dt = r.headers.get('Memento-Datetime')
    target_uri = r.headers.get('WARC-Target-URI')

    # cdx['urlkey'] = urlkey
    # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
    # cdx['url'] = target_uri

    set_content_loc = False

    # Check if Fuzzy Match
    if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
        set_content_loc = True

    # if redir to exact, redir if url or ts are different
    if self.redirect_to_exact:
        if (set_content_loc or
                (wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):

            new_url = urlrewriter.get_new_url(url=target_uri,
                                              timestamp=cdx['timestamp'],
                                              mod=wb_url.mod)

            resp = WbResponse.redir_response(new_url, '307 Temporary Redirect')
            if self.enable_memento:
                # Non-proxy timegate requests get the full memento links;
                # everything else only gets the 'original' link.
                if is_timegate and not is_proxy:
                    self._add_memento_links(target_uri, full_prefix,
                                            memento_dt, cdx['timestamp'],
                                            resp.status_headers,
                                            is_timegate, is_proxy)

                else:
                    resp.status_headers['Link'] = MementoUtils.make_link(
                        target_uri, 'original')

            return resp

    self._add_custom_params(cdx, r.headers, kwargs, record)

    # When _add_range reports true, the modifier is switched to 'id_'.
    if self._add_range(record, wb_url, range_start, range_end):
        wb_url.mod = 'id_'

    if is_ajax:
        head_insert_func = None
        urlrewriter.rewrite_opts['is_ajax'] = True
    else:
        top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
        head_insert_func = (self.head_insert_view.create_insert_func(
            wb_url,
            full_prefix,
            host_prefix,
            top_url,
            environ,
            framed_replay,
            coll=kwargs.get('coll', ''),
            replay_mod=self.replay_mod,
            config=self.config))

    cookie_rewriter = None
    if self.cookie_tracker and cookie_key:
        # skip add cookie if service worker is not 200
        # it seems cookie headers from service workers are not applied, so don't update in cache
        if wb_url.mod == 'sw_':
            cookie_key = None

        cookie_rewriter = self.cookie_tracker.get_rewriter(
            urlrewriter, cookie_key)

    urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

    result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx, environ)

    status_headers, gen, is_rw = result

    # History-page requests only record the page title and answer with
    # JSON; the rewritten body is not returned.
    if history_page:
        title = DefaultRewriter._extract_title(gen)
        if not title:
            title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

        if not title:
            title = history_page

        self._add_history_page(cdx, kwargs, title)
        return WbResponse.json_response({'title': title})

    if setcookie_headers:
        status_headers.headers.extend(setcookie_headers)

    # A status line without a space gets ' None' appended.
    if ' ' not in status_headers.statusline:
        status_headers.statusline += ' None'

    if not is_ajax and self.enable_memento:
        self._add_memento_links(cdx['url'], full_prefix,
                                memento_dt, cdx['timestamp'], status_headers,
                                is_timegate, is_proxy, cdx.get('source-coll'))

        set_content_loc = True

    if set_content_loc and not self.redirect_to_exact:
        status_headers.headers.append(
            ('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                                         url=cdx['url'])))

    if not is_proxy:
        self.add_csp_header(wb_url, status_headers)

    response = WbResponse(status_headers, gen)

    return response
def render_content(self, wb_url, kwargs, environ):
    """Load the capture addressed by ``wb_url`` and build the replay response.

    The raw ``wb_url`` string is parsed into a ``WbUrl``, the request is sent
    upstream through ``self._do_req`` and the returned record is run through
    the selected content rewriter.

    :param str wb_url: raw archival url string; ``'#'`` is escaped to
        ``%23`` before it is parsed
    :param dict kwargs: routing arguments, forwarded to the helper methods
        and attached to the error details on upstream failure
    :param dict environ: WSGI environ of the current request
    :return: a ``WbResponse`` (a redirect or the rewritten content), or the
        result of ``self.format_response`` when
        ``self.handle_custom_response`` produced a response
    :raises UpstreamException: if the upstream status code is >= 400
    """
    wb_url = wb_url.replace('#', '%23')
    wb_url = WbUrl(wb_url)

    is_timegate = self._check_accept_dt(wb_url, environ)

    host_prefix = self.get_host_prefix(environ)
    rel_prefix = self.get_rel_prefix(environ)
    full_prefix = host_prefix + rel_prefix

    is_proxy = ('wsgiprox.proxy_host' in environ)

    response = self.handle_custom_response(environ, wb_url,
                                           full_prefix, host_prefix,
                                           kwargs)

    if response:
        return self.format_response(response, wb_url, full_prefix,
                                    is_timegate, is_proxy)

    if is_proxy:
        environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
        urlrewriter = IdentityUrlRewriter(wb_url, '')
        framed_replay = False
    else:
        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix)
        framed_replay = self.framed_replay

    url_parts = urlsplit(wb_url.url)
    if not url_parts.path:
        # Empty path: redirect to the same url with '/' as its path.
        scheme, netloc, path, query, frag = url_parts
        path = '/'
        url = urlunsplit((scheme, netloc, path, query, frag))
        resp = WbResponse.redir_response(urlrewriter.rewrite(url),
                                         '307 Temporary Redirect')

        if self.enable_memento:
            resp.status_headers['Link'] = MementoUtils.make_link(url, 'original')

        return resp

    self.unrewrite_referrer(environ, full_prefix)

    urlkey = canonicalize(wb_url.url)

    if self.use_js_obj_proxy:
        content_rw = self.js_proxy_rw
    else:
        content_rw = self.default_rw

    inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

    inputreq.include_method_query(wb_url.url)

    range_start, range_end, skip_record = self._check_range(inputreq, wb_url)

    setcookie_headers = None
    if self.cookie_tracker:
        cookie_key = self.get_cookie_key(kwargs)
        res = self.cookie_tracker.get_cookie_headers(wb_url.url,
                                                     urlrewriter,
                                                     cookie_key)
        inputreq.extra_cookie, setcookie_headers = res

    r = self._do_req(inputreq, wb_url, kwargs, skip_record)

    if r.status_code >= 400:
        error = None
        try:
            error = r.raw.read()
        except Exception:
            # Best effort only: the error body is optional detail.
            pass
        finally:
            # Always release the raw stream, even when read() failed;
            # a failing close() must not mask the upstream error.
            try:
                r.raw.close()
            except Exception:
                pass

        if error:
            error = error.decode('utf-8')
        else:
            error = ''

        details = dict(args=kwargs, error=error)
        raise UpstreamException(r.status_code, url=wb_url.url, details=details)

    stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
    record = self.loader.parse_record_stream(stream,
                                             ensure_http_headers=True)

    memento_dt = r.headers.get('Memento-Datetime')
    target_uri = r.headers.get('WARC-Target-URI')

    cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

    set_content_loc = False

    # Check if Fuzzy Match
    if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
        set_content_loc = True

    # if redir to exact, redir if url or ts are different
    if self.redirect_to_exact:
        if (set_content_loc or
                (wb_url.timestamp != cdx.get('timestamp') and
                 not cdx.get('is_live'))):

            new_url = urlrewriter.get_new_url(url=target_uri,
                                              timestamp=cdx['timestamp'],
                                              mod=wb_url.mod)

            resp = WbResponse.redir_response(new_url, '307 Temporary Redirect')
            if self.enable_memento:
                if is_timegate and not is_proxy:
                    self._add_memento_links(target_uri, full_prefix,
                                            memento_dt, cdx['timestamp'],
                                            resp.status_headers,
                                            is_timegate, is_proxy)
                else:
                    resp.status_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

            return resp

    self._add_custom_params(cdx, r.headers, kwargs)

    if self._add_range(record, wb_url, range_start, range_end):
        wb_url.mod = 'id_'

    is_ajax = self.is_ajax(environ)

    if is_ajax:
        # No head insert for ajax requests.
        head_insert_func = None
        urlrewriter.rewrite_opts['is_ajax'] = True
    else:
        top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
        head_insert_func = (self.head_insert_view.
                            create_insert_func(wb_url,
                                               full_prefix,
                                               host_prefix,
                                               top_url,
                                               environ,
                                               framed_replay,
                                               config=self.config))

    cookie_rewriter = None
    if self.cookie_tracker:
        cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                           cookie_key)

    urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')
    result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx)

    status_headers, gen, is_rw = result

    if setcookie_headers:
        status_headers.headers.extend(setcookie_headers)

    # A status line without a space gets ' None' appended as its second part.
    if ' ' not in status_headers.statusline:
        status_headers.statusline += ' None'

    if not is_ajax and self.enable_memento:
        self._add_memento_links(cdx['url'], full_prefix,
                                memento_dt, cdx['timestamp'], status_headers,
                                is_timegate, is_proxy, cdx.get('source-coll'))

        set_content_loc = True

    if set_content_loc and not self.redirect_to_exact:
        status_headers.headers.append(
            ('Content-Location',
             urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                     url=cdx['url'])))

    if not is_proxy:
        self.add_csp_header(wb_url, status_headers)

    response = WbResponse(status_headers, gen)

    return response
def to_uri_pencode(url):
    """Parse ``url`` as a ``WbUrl`` and return the result of its ``get_url()``."""
    parsed = WbUrl(url)
    return parsed.get_url()
def handle_routing(self, wb_url, user, coll_name, rec_name, type,
                   is_embed=False,
                   is_display=False,
                   sources='',
                   inv_sources='',
                   redir_route=None):
    """Validate a content request and hand it to ``render_content``.

    Looks up the user, collection and recording, applies the checks for the
    requested mode, then renders the content and wraps the result in an
    ``HTTPResponse``.

    NOTE(review): the code keeps using ``the_user`` / ``recording`` right
    after calling ``self._raise_error`` / ``self.redir_set_session``, so
    those helpers presumably abort the request by raising -- confirm.

    :param wb_url: url portion of the request; normalised through
        ``self._full_url`` and ``self._context_massage``
    :param user: user name passed to ``user_manager.get_user_coll_rec``;
        ``'_new'`` together with ``redir_route`` delegates to
        ``self.do_create_new_and_redir``
    :param coll_name: collection name used for the lookup
    :param rec_name: recording name used for the lookup
    :param type: request mode; compared against ``self.MODIFY_MODES``,
        ``'replay'``, ``'replay-coll'`` and ``'record'``
    :param is_embed: passed through unchanged in the render kwargs
    :param is_display: passed through unchanged in the render kwargs
    :param sources: passed through unchanged in the render kwargs
    :param inv_sources: passed through; in a modify mode, a non-empty value
        other than ``'*'`` makes the patch recording get looked up
    :param redir_route: only used together with ``user == '_new'``
    :return: an ``HTTPResponse`` built from the rendered content, the result
        of a redirect call, or the rendered ``content_error.html`` view when
        an ``UpstreamException`` is caught
    """
    wb_url = self._full_url(wb_url)
    if user == '_new' and redir_route:
        return self.do_create_new_and_redir(coll_name, rec_name, wb_url, redir_route)

    sesh = self.get_session()

    remote_ip = None
    frontend_cache_header = None
    patch_recording = None

    the_user, collection, recording = self.user_manager.get_user_coll_rec(user, coll_name, rec_name)

    if not the_user:
        msg = 'not_found' if user == 'api' else 'no_such_user'
        self._raise_error(404, msg)

    coll = collection.my_id if collection else None
    rec = recording.my_id if recording else None

    if type in self.MODIFY_MODES:
        if sesh.is_new() and self.is_content_request():
            self.redir_set_session()

        if not recording:
            self._redir_if_sanitized(self.sanitize_title(rec_name),
                                     rec_name,
                                     wb_url)

            # don't auto create recording for inner frame w/o accessing outer frame
            self._raise_error(404, 'no_such_recording')

        elif not recording.is_open():
            # force creation of new recording as this one is closed
            self._raise_error(400, 'recording_not_open')

        collection.access.assert_can_write_coll(collection)

        if the_user.is_out_of_space():
            self._raise_error(402, 'out_of_space')

        remote_ip = self._get_remote_ip()

        # check_rate_limit's return value replaces the ip forwarded below
        remote_ip = self.check_rate_limit(the_user, remote_ip)

        if inv_sources and inv_sources != '*':
            #patch_rec_name = self.patch_of_name(rec, True)
            patch_recording = recording.get_patch_recording()
            #patch_recording = collection.get_recording_by_name(patch_rec_name)

    if type in ('replay-coll', 'replay'):
        if not collection:
            self._redir_if_sanitized(self.sanitize_title(coll_name),
                                     coll_name,
                                     wb_url)

            if sesh.is_new() and self.is_content_request():
                self.redir_set_session()
            else:
                self._raise_error(404, 'no_such_collection')

        access = self.access.check_read_access_public(collection)

        if not access:
            if sesh.is_new() and self.is_content_request():
                self.redir_set_session()
            else:
                self._raise_error(404, 'no_such_collection')

        # anything other than 'public' access is marked as privately cacheable only
        if access != 'public':
            frontend_cache_header = ('Cache-Control', 'private')

        if type == 'replay':
            if not recording:
                self._raise_error(404, 'no_such_recording')

    request.environ['SCRIPT_NAME'] = quote(request.environ['SCRIPT_NAME'], safe='/:')

    wb_url = self._context_massage(wb_url)

    wb_url_obj = WbUrl(wb_url)

    # top frame: modifier equals self.frame_mod or starts with '$br:'
    is_top_frame = (wb_url_obj.mod == self.frame_mod or wb_url_obj.mod.startswith('$br:'))

    if type == 'record' and is_top_frame:
        # a result from check_remote_archive replaces mode and url, then redirects
        result = self.check_remote_archive(wb_url, type, wb_url_obj)
        if result:
            mode, wb_url = result
            new_url = '/{user}/{coll}/{rec}/{mode}/{url}'.format(user=user,
                                                                 coll=coll_name,
                                                                 rec=rec_name,
                                                                 mode=mode,
                                                                 url=wb_url)
            return self.redirect(new_url)

    elif type == 'replay-coll' and not is_top_frame:
        collection.sync_coll_index(exists=False, do_async=False)

    # arguments handed to render_content (and to the browser manager below)
    kwargs = dict(user=user,
                  id=sesh.get_id(),
                  coll=coll,
                  rec=rec,
                  coll_name=quote(coll_name),
                  rec_name=quote(rec_name, safe='/*'),

                  the_user=the_user,
                  collection=collection,
                  recording=recording,
                  patch_recording=patch_recording,

                  type=type,
                  sources=sources,
                  inv_sources=inv_sources,
                  patch_rec=patch_recording.my_id if patch_recording else None,
                  ip=remote_ip,
                  is_embed=is_embed,
                  is_display=is_display)

    # top-frame replay but through a proxy, redirect to original
    if is_top_frame and 'wsgiprox.proxy_host' in request.environ:
        kwargs['url'] = wb_url_obj.url
        kwargs['request_ts'] = wb_url_obj.timestamp
        self.browser_mgr.update_local_browser(kwargs)
        return redirect(wb_url_obj.url)

    try:
        self.check_if_content(wb_url_obj, request.environ, is_top_frame)

        resp = self.render_content(wb_url, kwargs, request.environ)

        if frontend_cache_header:
            resp.status_headers.headers.append(frontend_cache_header)

        # re-wrap the rendered response's body, status line and headers
        resp = HTTPResponse(body=resp.body,
                            status=resp.status_headers.statusline,
                            headers=resp.status_headers.headers)

        return resp

    except UpstreamException as ue:
        # context for the error page / error redirect query string
        err_context = {
            'url': ue.url,
            'status': ue.status_code,
            'error': ue.msg.get('error'),
            'timestamp': wb_url_obj.timestamp if wb_url_obj else '',
            'user': user,
            'coll': coll_name,
            'rec': rec_name,
            'type': type,
            'app_host': self.app_host,
        }

        @self.jinja2_view('content_error.html')
        def handle_error(error):
            # propagate the upstream status code to the outgoing response
            response.status = ue.status_code
            return error

        if self.content_error_redirect:
            return redirect(self.content_error_redirect + '?' + urlencode(err_context), code=307)
        else:
            return handle_error(err_context)
def render_content(self, wb_url, kwargs, environ):
    """Load the capture addressed by ``wb_url`` and build the replay response.

    The raw ``wb_url`` string is parsed into a ``WbUrl``, the request is sent
    upstream through ``self._do_req`` and the returned record is run through
    the selected content rewriter.  Along the way the method may return
    early with a redirect, a preflight response, a timemap / query / custom
    response or a JSON title response for history-page requests.

    :param str wb_url: raw archival url string; ``'#'`` is escaped to
        ``%23`` before it is parsed
    :param dict kwargs: routing arguments; the keys read here are
        ``'output'``, ``'no_timegate_check'``, ``'coll'``, ``'metadata'``
        and ``'cache'``; the dict is also forwarded to the helper methods
    :param dict environ: WSGI environ of the current request; the
        ``HTTP_X_WOMBAT_HISTORY_PAGE`` key is popped from it
    :return: a ``WbResponse``, or the result of ``self.format_response`` /
        ``self.send_redirect``
    :raises NotFoundException: if the upstream status code is 404
    :raises UpstreamException: for any other upstream status code >= 400
    """
    wb_url = wb_url.replace('#', '%23')
    wb_url = WbUrl(wb_url)

    # a history-page request overrides the url and is always treated as ajax
    history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
    if history_page:
        wb_url.url = history_page
        is_ajax = True
    else:
        is_ajax = self.is_ajax(environ)

    is_timegate = self._check_accept_dt(wb_url, environ)

    self.prepare_env(environ)

    host_prefix = environ['pywb.host_prefix']
    rel_prefix = self.get_rel_prefix(environ)
    full_prefix = host_prefix + rel_prefix
    pywb_static_prefix = environ['pywb.static_prefix'] + '/'
    is_proxy = ('wsgiprox.proxy_host' in environ)

    # if OPTIONS in proxy mode, just generate the proxy response
    if is_proxy and self.is_preflight(environ):
        return WbResponse.options_response(environ)

    if self.use_js_obj_proxy:
        content_rw = self.js_proxy_rw
    else:
        content_rw = self.default_rw

    # no redirects if in proxy
    redirect_to_exact = self.redirect_to_exact and not is_proxy

    # Check Prefer
    pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ,
                                                  content_rw, is_proxy)

    response = None
    keep_frame_response = False

    # prefer overrides custom response?
    if pref_mod is not None:
        # fast-redirect to preferred
        if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
            new_url = full_prefix + wb_url.to_str(mod=pref_mod)
            headers = [('Preference-Applied', pref_applied),
                       ('Vary', 'Prefer')]

            return WbResponse.redir_response(new_url,
                                             '307 Temporary Redirect',
                                             headers=headers)
        else:
            wb_url.mod = pref_mod
    else:
        if kwargs.get('output'):
            response = self.handle_timemap(wb_url, kwargs, full_prefix)

        elif wb_url.is_query():
            response = self.handle_query(environ, wb_url, kwargs, full_prefix)

        else:
            response = self.handle_custom_response(environ, wb_url,
                                                   full_prefix, host_prefix,
                                                   kwargs)

            # when set, the custom response is held back until the cdx
            # of the upstream result is known (see below)
            keep_frame_response = (not kwargs.get('no_timegate_check') and is_timegate and not is_proxy) or redirect_to_exact

    if response and not keep_frame_response:
        return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)

    if is_proxy:
        environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
        urlrewriter = IdentityUrlRewriter(wb_url, '')
        framed_replay = False

    else:
        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix,
                                  pywb_static_prefix=pywb_static_prefix)

        framed_replay = self.framed_replay

    # an empty path is redirected to '/'
    url_parts = urlsplit(wb_url.url)
    if not url_parts.path:
        return self.send_redirect('/', url_parts, urlrewriter)

    self.unrewrite_referrer(environ, full_prefix)

    urlkey = canonicalize(wb_url.url)

    inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

    inputreq.include_method_query(wb_url.url)

    range_start, range_end, skip_record = self._check_range(
        inputreq, wb_url)

    setcookie_headers = None
    cookie_key = None
    if self.cookie_tracker:
        cookie_key = self.get_cookie_key(kwargs)
        if cookie_key:
            res = self.cookie_tracker.get_cookie_headers(
                wb_url.url, urlrewriter, cookie_key,
                environ.get('HTTP_COOKIE', ''))
            inputreq.extra_cookie, setcookie_headers = res

    r = self._do_req(inputreq, wb_url, kwargs, skip_record)

    if r.status_code >= 400:
        # reading the error body is best effort; the raw stream is
        # closed in every case
        error = None
        try:
            error = r.raw.read()
        except Exception:
            pass
        finally:
            no_except_close(r.raw)

        if error:
            error = error.decode('utf-8')
        else:
            error = ''

        details = dict(args=kwargs, error=error)
        if r.status_code == 404:
            raise NotFoundException(url=wb_url.url, msg=details)

        else:
            raise UpstreamException(r.status_code,
                                    url=wb_url.url,
                                    details=details)

    cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

    cdx_url_parts = urlsplit(cdx['url'])

    if cdx_url_parts.path.endswith(
            '/') and not url_parts.path.endswith('/'):
        # add trailing slash
        new_path = url_parts.path + '/'

        no_except_close(r.raw)

        return self.send_redirect(new_path, url_parts, urlrewriter)

    # only redirect to exact if not live, otherwise set to false
    redirect_to_exact = redirect_to_exact and not cdx.get('is_live')

    # return top-frame timegate response, with timestamp from cdx
    if response and keep_frame_response and (not redirect_to_exact
                                             or not is_timegate):
        no_except_close(r.raw)
        return self.format_response(response, wb_url, full_prefix,
                                    is_timegate, is_proxy,
                                    cdx['timestamp'])

    stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
    record = self.loader.parse_record_stream(stream,
                                             ensure_http_headers=True)

    memento_dt = r.headers.get('Memento-Datetime')
    target_uri = r.headers.get('WARC-Target-URI')

    # cdx['urlkey'] = urlkey
    # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
    # cdx['url'] = target_uri

    set_content_loc = False

    # Check if Fuzzy Match
    if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
        set_content_loc = True

    # if redirect to exact timestamp (only set if not live)
    if redirect_to_exact:
        if set_content_loc or is_timegate or wb_url.timestamp != cdx.get(
                'timestamp'):
            new_url = urlrewriter.get_new_url(url=target_uri,
                                              timestamp=cdx['timestamp'],
                                              mod=wb_url.mod)

            resp = WbResponse.redir_response(new_url,
                                             '307 Temporary Redirect')
            if self.enable_memento:
                if is_timegate and not is_proxy:
                    self._add_memento_links(target_uri,
                                            full_prefix,
                                            memento_dt,
                                            cdx['timestamp'],
                                            resp.status_headers,
                                            is_timegate,
                                            is_proxy,
                                            pref_applied=pref_applied,
                                            mod=pref_mod,
                                            is_memento=False)

                else:
                    resp.status_headers['Link'] = MementoUtils.make_link(
                        target_uri, 'original')

            return resp

    self._add_custom_params(cdx, r.headers, kwargs, record)

    # when _add_range reports success the modifier is switched to 'id_'
    if self._add_range(record, wb_url, range_start, range_end):
        wb_url.mod = 'id_'

    if is_ajax:
        head_insert_func = None
        urlrewriter.rewrite_opts['is_ajax'] = True
    else:
        top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
        head_insert_func = (self.head_insert_view.create_insert_func(
            wb_url,
            full_prefix,
            host_prefix,
            top_url,
            environ,
            framed_replay,
            coll=kwargs.get('coll', ''),
            replay_mod=self.replay_mod,
            metadata=kwargs.get('metadata', {}),
            config=self.config))

    cookie_rewriter = None
    if self.cookie_tracker and cookie_key:
        # skip add cookie if service worker is not 200
        # it seems cookie headers from service workers are not applied, so don't update in cache
        if wb_url.mod == 'sw_':
            cookie_key = None

        cookie_rewriter = self.cookie_tracker.get_rewriter(
            urlrewriter, cookie_key)

    urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

    result = content_rw(record, urlrewriter, cookie_rewriter,
                        head_insert_func, cdx, environ)

    status_headers, gen, is_rw = result

    # history-page requests only report a title: taken from the content,
    # else from the title header, else the history page url itself
    if history_page:
        title = DefaultRewriter._extract_title(gen)
        if not title:
            title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

        if not title:
            title = history_page

        self._add_history_page(cdx, kwargs, title)
        return WbResponse.json_response({'title': title})

    if setcookie_headers:
        status_headers.headers.extend(setcookie_headers)

    # a status line without a space gets ' None' appended
    if ' ' not in status_headers.statusline:
        status_headers.statusline += ' None'

    if not is_ajax and self.enable_memento:
        self._add_memento_links(cdx['url'],
                                full_prefix,
                                memento_dt,
                                cdx['timestamp'],
                                status_headers,
                                is_timegate,
                                is_proxy,
                                cdx.get('source-coll'),
                                mod=pref_mod,
                                pref_applied=pref_applied)

        set_content_loc = True

    if set_content_loc and not redirect_to_exact and not is_proxy:
        status_headers.headers.append(
            ('Content-Location',
             urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                     url=cdx['url'])))

    if not is_proxy:
        self.add_csp_header(wb_url, status_headers)

    response = WbResponse(status_headers, gen)

    if is_proxy and environ.get('HTTP_ORIGIN'):
        response.add_access_control_headers(environ)

    # long-lived public caching only for 200 responses with cache == 'always'
    # and a Referer header present
    if r.status_code == 200 and kwargs.get(
            'cache') == 'always' and environ.get('HTTP_REFERER'):
        response.status_headers[
            'Cache-Control'] = 'public, max-age=31536000, immutable'

    return response
def to_str(self, **overrides):
    """Serialize through ``WbUrl.to_str`` with ``mod`` and ``timestamp`` blanked.

    Any ``mod`` / ``timestamp`` supplied by the caller is overwritten with
    the empty string; all other overrides are passed through unchanged.
    """
    overrides.update(mod='', timestamp='')
    return WbUrl.to_str(self, **overrides)
def handle_magic_page(self, env):
    """Dispatch a request to a "magic" proxy host, based on the host name.

    The host name is read from ``env['pywb.proxy_host']`` and the path (plus
    query string, if any) from ``env['REL_REQUEST_URI']``.  Depending on how
    the host name matches (``auto`` prefix, ``query.`` prefix, the set
    prefix suffix, the sethost prefix, or ``select.``) a cookie, redirect,
    magic or selection response is returned.

    :param dict env: request environ
    :return: the response built by the matching branch; ``None`` for a
        ``query.`` host whose url is a query, or when no branch matches
    """
    split_uri = urlsplit(env['REL_REQUEST_URI'])
    server_name = env['pywb.proxy_host']

    # path without its leading '/', plus the query string if present
    path_url = split_uri.path[1:]
    if split_uri.query:
        path_url = path_url + '?' + split_uri.query

    if server_name.startswith('auto'):
        coll, _ts, sesh_id = self.get_coll(env)
        if not coll:
            return self.make_magic_response('select', path_url, env)
        return self.make_sethost_cookie_response(sesh_id, path_url, env)

    if server_name.startswith('query.'):
        wb_url = WbUrl(path_url)

        # only dealing with specific timestamp setting
        if wb_url.is_query():
            return None

        coll, _ts, sesh_id = self.get_coll(env)
        if not coll:
            return self.make_magic_response('select', path_url, env)

        self.set_ts(sesh_id, wb_url.timestamp)
        return self.make_redir_response(wb_url.url)

    if server_name.endswith(self.set_prefix):
        old_sesh_id = extract_client_cookie(env, self.cookie_name)
        sesh_id = self.create_renew_sesh_id(old_sesh_id)

        # cookie headers are only sent when the session id changed
        headers = (self.make_cookie_headers(sesh_id, self.magic_name)
                   if sesh_id != old_sesh_id else None)

        coll = server_name[:-len(self.set_prefix)]

        # set sesh value
        self.set_coll(sesh_id, coll)

        return self.make_sethost_cookie_response(sesh_id, path_url, env,
                                                 headers=headers)

    if self.sethost_prefix in server_name:
        # host name layout: <sesh_id><sethost_prefix><domain>
        split_at = server_name.find(self.sethost_prefix)
        sesh_id = server_name[:split_at]
        domain = server_name[split_at + len(self.sethost_prefix):]

        headers = self.make_cookie_headers(sesh_id, domain)

        full_url = env['pywb.proxy_scheme'] + '://' + domain + '/' + path_url
        return self.make_redir_response(full_url, headers=headers)

    if 'select.' in server_name:
        coll, _ts, _sesh_id = self.get_coll(env)
        route_temp = '-set.' + self.magic_name + '/' + path_url

        return self.proxy_select_view.render_response(routes=self.routes,
                                                      route_temp=route_temp,
                                                      coll=coll,
                                                      url=path_url)

    return None