def timegate(request, url):
    """
    Memento TimeGate endpoint: redirect the client to the memento closest
    to the requested Accept-Datetime (or to "now" if none is supplied).

    Returns 404 if no memento data exists for the URL, and 400 if an
    Accept-Datetime header is present but unparseable.
    """
    # impose an arbitrary length-limit on the submitted URL, so that the headers don't become illegally large
    url = url_with_qs_and_hash(url, request.META['QUERY_STRING'])[:500]
    data = memento_data_for_url(request, url)
    if not data:
        return HttpResponseNotFound('404 page not found\n')

    # Use the client-requested datetime when present and parseable;
    # otherwise default to the current time.
    accept_datetime = request.META.get('HTTP_ACCEPT_DATETIME')
    if accept_datetime:
        accept_datetime = parse_date(accept_datetime)
        if not accept_datetime:
            return HttpResponseBadRequest('Invalid value for Accept-Datetime.')
    else:
        accept_datetime = timezone.now()
    # NOTE(review): replace() stamps UTC onto the value without converting it —
    # assumes parsed Accept-Datetime values are already expressed in UTC; confirm.
    accept_datetime = accept_datetime.replace(tzinfo=tzutc())

    # Pick the memento whose timestamp is closest to the requested datetime.
    target, target_datetime = closest(map(lambda m: m.values(), data['mementos']['list']), accept_datetime)

    response = redirect(target)
    response['Vary'] = 'accept-datetime'
    # Advertise the Memento link relations: original, timegate, the three
    # timemap formats, first/last mementos, and the memento chosen above.
    response['Link'] = str(
        LinkHeader([
            Rel(data['original_uri'], rel='original'),
            Rel(data['timegate_uri'], rel='timegate'),
            Rel(data['timemap_uri']['link_format'], rel='timemap', type='application/link-format'),
            Rel(data['timemap_uri']['json_format'], rel='timemap', type='application/json'),
            Rel(data['timemap_uri']['html_format'], rel='timemap', type='text/html'),
            Rel(data['mementos']['first']['uri'], rel='first memento', datetime=datetime_to_http_date(data['mementos']['first']['datetime'])),
            Rel(data['mementos']['last']['uri'], rel='last memento', datetime=datetime_to_http_date(data['mementos']['last']['datetime'])),
            Rel(target, rel='memento', datetime=datetime_to_http_date(target_datetime)),
        ])
    )
    return response
def _update_redis_and_cookie(self, pi, set_cookie, session, headers):
    """
    Persist the session to redis via pipeline `pi` and, when `set_cookie`
    is true, refresh the redis TTL and append a Set-Cookie header to
    `headers` (a list of (name, value) tuples).
    """
    duration = self.durations[session.dura_type]['total']

    if session.should_save:
        # Session payload is stored pickled + base64-encoded under the session key.
        data = base64.b64encode(pickle.dumps(session._sesh))

        ttl = session.ttl
        if ttl < 0:
            # no TTL yet: use the configured duration for this session type
            ttl = duration

        pi.setex(session.key, ttl, data)

    if not set_cookie:
        return

    self.track_long_term(session)

    expires = datetime.utcnow() + timedelta(seconds=duration)

    # set redis duration
    pi.expire(session.key, duration)

    # set cookie
    sesh_cookie = self.id_to_signed_cookie(session['id'], session.is_restricted)

    # NOTE(review): positional arg {2} (the formatted expires date) is never
    # referenced by this template — only {0}, {1} and {3} appear. Confirm
    # whether an 'expires={2}' cookie attribute was intended.
    value = '{0}={1}; Path=/; HttpOnly; max-age={3}'
    value = value.format(self.sesh_key, sesh_cookie,
                         datetime_to_http_date(expires),
                         duration)

    scheme = session.environ.get('wsgi.url_scheme', '')
    if scheme.lower() == 'https':
        # only mark the cookie Secure when the request came in over https
        value += '; Secure'

    headers.append(('Set-Cookie', value))
def timemap(request, response_format, url):
    """
    Memento TimeMap endpoint: list all mementos for `url` in the requested
    format ('json', 'html', or application/link-format by default).

    Returns a 404 response (HTML or plain text, matching the requested
    format) when no memento data exists; always sets X-Memento-Count.
    """
    url = url_with_qs_and_hash(url, request.META['QUERY_STRING'])
    data = memento_data_for_url(request, url)
    if data:
        if response_format == 'json':
            response = JsonResponse(data)
        elif response_format == 'html':
            response = render(request, 'memento/timemap.html', data)
        else:
            # Default: application/link-format, one comma-terminated link per line.
            content_type = 'application/link-format'
            file = StringIO()
            file.writelines(f"{line},\n" for line in [
                Rel(data['original_uri'], rel='original'),
                Rel(data['timegate_uri'], rel='timegate'),
                Rel(data['self'], rel='self', type='application/link-format'),
                Rel(data['timemap_uri']['link_format'], rel='timemap', type='application/link-format'),
                Rel(data['timemap_uri']['json_format'], rel='timemap', type='application/json'),
                Rel(data['timemap_uri']['html_format'], rel='timemap', type='text/html')
            ] + [
                Rel(memento['uri'], rel='memento', datetime=datetime_to_http_date(memento['datetime']))
                for memento in data['mementos']['list']
            ])
            file.seek(0)
            # Fix: pass the string directly — f'{content_type}' was a redundant
            # f-string wrapper around an existing str.
            response = HttpResponse(file, content_type=content_type)
    else:
        if response_format == 'html':
            response = render(request, 'memento/timemap.html', {"original_uri": url}, status=404)
        else:
            response = HttpResponseNotFound('404 page not found\n')

    response['X-Memento-Count'] = str(len(data['mementos']['list'])) if data else 0
    return response
def _add_cache_headers(self, new_headers, http_cache):
    """
    Append cache-related headers to `new_headers` (a list of (name, value)
    tuples) based on `http_cache`, a max-age value in seconds.

    Missing / non-numeric values and ages <= 0 disable caching; positive
    ages emit matching Cache-Control: max-age and Expires headers.
    """
    try:
        age = int(http_cache)
    except (TypeError, ValueError):
        # Fix: narrowed from a bare `except:` so that only conversion
        # failures (None, non-numeric strings) mean "no caching" — a bare
        # except also swallowed KeyboardInterrupt/SystemExit.
        age = 0

    if age <= 0:
        # Fix: Cache-Control directives are comma-separated (RFC 7234);
        # the previous value used an invalid semicolon separator.
        new_headers.append(('Cache-Control', 'no-cache, no-store'))
    else:
        expires_at = datetime.utcnow() + timedelta(seconds=age)
        new_headers.append(('Cache-Control', 'max-age=' + str(age)))
        new_headers.append(('Expires', datetime_to_http_date(expires_at)))
def _update_redis_and_cookie(self, set_cookie, session, headers):
    """
    Persist the session to redis and, when `set_cookie` is true, refresh
    the redis TTL and append a Set-Cookie header to `headers`.

    Unlike the pipeline-parameter variant, this version opens its own
    redis pipeline when the session needs saving.
    """
    duration = self.durations[session.dura_type]['total']

    if session.should_save:
        with redis_pipeline(self.redis) as pi:
            # Session payload is stored pickled + base64-encoded under the session key.
            data = base64.b64encode(pickle.dumps(session._sesh))

            ttl = session.ttl
            # PERMA CUSTOMIZATION: changed from < to <=
            # https://github.com/webrecorder/webrecorder/pull/721
            if ttl <= 0:
                ttl = duration

            pi.setex(session.key, ttl, data)

            if set_cookie:
                self.track_long_term(session, pi)

                # set redis duration
                if session.curr_role != 'anon':
                    pi.expire(session.key, duration)

    elif set_cookie and session.curr_role != 'anon':
        # extend redis duration if extending cookie!
        self.redis.expire(session.key, duration)

    if not set_cookie:
        return

    expires = datetime.utcnow() + timedelta(seconds=duration)

    # set cookie
    sesh_cookie = session.get_cookie()

    value = '{0}={1}; Path=/; HttpOnly'

    # add max-age only if:
    # - long duration session
    # - anonymous session (not restricted)
    # don't set for restricted session, as cookie only valid as long as top session exists
    if session.dura_type == 'long' or session.curr_role == 'anon':
        value += '; max-age={3}'

    # NOTE(review): positional arg {2} (the formatted expires date) is never
    # referenced by either template form — confirm whether an 'expires={2}'
    # cookie attribute was intended.
    value = value.format(self.sesh_key, sesh_cookie,
                         datetime_to_http_date(expires),
                         duration)

    scheme = session.environ.get('wsgi.url_scheme', '')
    if scheme.lower() == 'https':
        # only mark the cookie Secure when the request came in over https
        value += '; Secure'

    headers.append(('Set-Cookie', value))
def _make_cache_headers():
    """Return a fixed list of sample cache-related HTTP header tuples.

    The values are dummies (fixture-style data); 'Expires' reflects the
    current local time at call time.
    """
    headers = [
        ('Content-Length', '123'),
        ('Cache-Control', 'max-age=10'),
        ('Expires', datetime_to_http_date(datetime.now())),
        ('ETag', '123456'),
    ]
    return headers
def single_permalink(request, guid):
    """
    Given a Perma ID, serve it up.

    Handles GUID canonicalization, replacement links, serve-type routing
    (source/image/warc_download), playback initialization (with one retry),
    status-code adjustment for deleted/private links, and Memento headers.
    """
    raw_user_agent = request.META.get('HTTP_USER_AGENT', '')

    # Create a canonical version of guid (non-alphanumerics removed, hyphens every 4 characters, uppercase),
    # and forward to that if it's different from current guid.
    canonical_guid = Link.get_canonical_guid(guid)

    # We only do the redirect if the correctly-formatted GUID actually exists --
    # this prevents actual 404s from redirecting with weird formatting.
    link = get_object_or_404(Link.objects.all_with_deleted(), guid=canonical_guid)

    if canonical_guid != guid:
        return HttpResponsePermanentRedirect(reverse('single_permalink', args=[canonical_guid]))

    # Forward to replacement link if replacement_link is set.
    if link.replacement_link_id:
        return HttpResponseRedirect(reverse('single_permalink', args=[link.replacement_link_id]))

    # If we get an unrecognized archive type (which could be an old type like 'live' or 'pdf'), forward to default version
    serve_type = request.GET.get('type')
    if serve_type is None:
        serve_type = 'source'
    elif serve_type not in valid_serve_types:
        return HttpResponsePermanentRedirect(reverse('single_permalink', args=[canonical_guid]))

    # serve raw WARC
    if serve_type == 'warc_download':
        return stream_warc_if_permissible(link, request.user)

    # handle requested capture type
    if serve_type == 'image':
        capture = link.screenshot_capture

        # not all Perma Links have screenshots; if no screenshot is present,
        # forward to primary capture for playback or for appropriate error message
        if (not capture or capture.status != 'success') and link.primary_capture:
            return HttpResponseRedirect(reverse('single_permalink', args=[guid]))
    else:
        capture = link.primary_capture

        # if primary capture did not work, but screenshot did work, forward to screenshot
        if (not capture or capture.status != 'success') and link.screenshot_capture and link.screenshot_capture.status == 'success':
            return HttpResponseRedirect(reverse('single_permalink', args=[guid]) + "?type=image")

    try:
        capture_mime_type = capture.mime_type()
    except AttributeError:
        # If capture is deleted, then mime type does not exist. Catch error.
        capture_mime_type = None

    # Special handling for mobile pdf viewing because it can be buggy
    # Redirecting to a download page if on mobile
    redirect_to_download_view = redirect_to_download(capture_mime_type, raw_user_agent)

    # If this record was just created by the current user, show them a new record message
    new_record = request.user.is_authenticated and link.created_by_id == request.user.id and not link.user_deleted \
        and link.creation_timestamp > timezone.now() - timedelta(seconds=300)

    # Provide the max upload size, in case the upload form is used
    max_size = settings.MAX_ARCHIVE_FILE_SIZE / 1024 / 1024

    # Fall back to a generated description when the submitter didn't supply one.
    if not link.submitted_description:
        link.submitted_description = "This is an archive of %s from %s" % (
            link.submitted_url,
            link.creation_timestamp.strftime("%A %d, %B %Y"))

    logger.info(f"Preparing context for {link.guid}")
    context = {
        'link': link,
        'redirect_to_download_view': redirect_to_download_view,
        'mime_type': capture_mime_type,
        'can_view': request.user.can_view(link),
        'can_edit': request.user.can_edit(link),
        'can_delete': request.user.can_delete(link),
        'can_toggle_private': request.user.can_toggle_private(link),
        'capture': capture,
        'serve_type': serve_type,
        'new_record': new_record,
        'this_page': 'single_link',
        'max_size': max_size,
        'link_url': settings.HOST + '/' + link.guid,
        'protocol': protocol(),
    }

    if context['can_view'] and link.can_play_back():
        try:
            logger.info(f"Initializing play back of {link.guid}")
            wr_username = link.init_replay_for_user(request)
        except Exception:  # noqa
            # We are experiencing many varieties of transient flakiness in playback:
            # second attempts, triggered by refreshing the page, almost always seem to work.
            # While we debug... let's give playback a second try here, and see if this
            # noticeably improves user experience.
            logger.exception(f"First attempt to init replay of {link.guid} failed. (Retrying: observe whether this error recurs.)")
            time.sleep(settings.WR_PLAYBACK_RETRY_AFTER)
            logger.info(f"Initializing play back of {link.guid} (2nd try)")
            # NOTE: a second failure here is deliberately allowed to propagate.
            wr_username = link.init_replay_for_user(request)

        logger.info(f"Updating context with WR playback information for {link.guid}")
        context.update({
            'wr_host': settings.PLAYBACK_HOST,
            'wr_prefix': link.wr_iframe_prefix(wr_username),
            'wr_url': capture.url,
            'wr_timestamp': link.creation_timestamp.strftime('%Y%m%d%H%M%S'),
        })

    logger.info(f"Rendering template for {link.guid}")
    response = render(request, 'archive/single-link.html', context)

    # Adjust status code
    if link.user_deleted:
        response.status_code = 410
    elif not context['can_view'] and link.is_private:
        response.status_code = 403

    # Add memento headers, when appropriate
    logger.info(f"Deciding whether to include memento headers for {link.guid}")
    if link.is_visible_to_memento():
        logger.info(f"Including memento headers for {link.guid}")
        response['Memento-Datetime'] = datetime_to_http_date(link.creation_timestamp)
        # impose an arbitrary length-limit on the submitted URL, so that this header doesn't become illegally large
        url = link.submitted_url[:500]
        response['Link'] = str(
            LinkHeader([
                Rel(url, rel='original'),
                Rel(timegate_url(request, url), rel='timegate'),
                Rel(timemap_url(request, url, 'link'), rel='timemap', type='application/link-format'),
                Rel(timemap_url(request, url, 'json'), rel='timemap', type='application/json'),
                Rel(timemap_url(request, url, 'html'), rel='timemap', type='text/html'),
                Rel(memento_url(request, link), rel='memento', datetime=datetime_to_http_date(link.creation_timestamp)),
            ]))
    logger.info(f"Returning response for {link.guid}")
    return response
def __call__(self, cdx, params):
    """
    Load the resource for `cdx` and return (out_headers, streamiter):
    upstream-facing headers plus an iterator over the WARC record body.
    Returns (None, None) when the resource cannot be loaded.
    """
    entry = self.load_resource(cdx, params)
    if not entry:
        return None, None

    # gzip-compress the outgoing stream only when explicitly requested
    compress = params.get('compress') == 'gzip'

    warc_headers, other_headers, stream = entry

    source = self._get_source_id(cdx)

    out_headers = {}
    out_headers['Warcserver-Type'] = 'warc'
    out_headers['Content-Type'] = 'application/warc-record'

    if params.get('recorder_skip'):
        # signal downstream recorder to skip writing this record
        out_headers['Recorder-Skip'] = '1'
        cdx['recorder_skip'] = '1'

    out_headers['Warcserver-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
    out_headers['Warcserver-Source-Coll'] = to_native_str(source)

    if not warc_headers:
        # No WARC record headers: pass through selected upstream headers as-is.
        if other_headers:
            out_headers['Link'] = other_headers.get('Link')
            out_headers['Memento-Datetime'] = other_headers.get('Memento-Datetime')
            if not compress:
                out_headers['Content-Length'] = other_headers.get('Content-Length')

        return out_headers, StreamIter(stream, closer=call_release_conn)

    target_uri = warc_headers.get_header('WARC-Target-URI')

    out_headers['WARC-Target-URI'] = target_uri
    out_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

    # convert WARC-Date (ISO format) to an HTTP-date Memento-Datetime
    memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
    out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)

    warc_headers_buff = warc_headers.to_bytes()

    if not compress:
        lenset = self._set_content_len(warc_headers.get_header('Content-Length'),
                                       out_headers,
                                       len(warc_headers_buff))
    else:
        # length not knowable once the stream is gzipped
        lenset = False

    streamiter = StreamIter(stream,
                            header1=warc_headers_buff,
                            header2=other_headers,
                            closer=call_release_conn)

    if compress:
        streamiter = compress_gzip_iter(streamiter)
        out_headers['Content-Encoding'] = 'gzip'

    #if not lenset:
    #    out_headers['Transfer-Encoding'] = 'chunked'
    #    streamiter = chunk_encode_iter(streamiter)

    return out_headers, streamiter
def load_resource(self, cdx, params):
    """
    Load a live/upstream resource described by `cdx` and build WARC-style
    metadata for it.

    Returns:
        None on failure (no load_url, video content, or missing expected
        Memento-Datetime);
        (None, upstream.headers, upstream) when upstream is already a
        'warc' response from another warcserver (pass-through);
        (warc_headers, http_headers_buff, upstream) otherwise, where
        warc_headers is a StatusAndHeaders and http_headers_buff the
        encoded reconstructed HTTP response headers.
    """
    load_url = cdx.get('load_url')
    if not load_url:
        return None

    if params.get('content_type') == VideoLoader.CONTENT_TYPE:
        # video content is handled by a different loader
        return None

    if self.forward_proxy_prefix and not cdx.get('is_live'):
        load_url = self.forward_proxy_prefix + load_url

    input_req = params['_input_req']

    req_headers = input_req.get_req_headers()

    dt = timestamp_to_datetime(cdx['timestamp'])

    if cdx.get('memento_url'):
        # ask the upstream memento source for the capture closest to dt
        req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

    method = input_req.get_req_method()
    data = input_req.get_req_body()

    # Use requests' PreparedRequest to normalize the URL and derive any
    # Authorization header embedded in it.
    p = PreparedRequest()
    try:
        p.prepare_url(load_url, None)
    except:
        raise LiveResourceException(load_url)
    p.prepare_headers(None)
    p.prepare_auth(None, load_url)

    auth = p.headers.get('Authorization')
    if auth:
        req_headers['Authorization'] = auth

    load_url = p.url

    # host is set to the actual host for live loading
    # ensure it is set to the load_url host
    if not cdx.get('is_live'):
        #req_headers.pop('Host', '')
        req_headers['Host'] = urlsplit(p.url).netloc

        referrer = cdx.get('set_referrer')
        if referrer:
            req_headers['Referer'] = referrer

    upstream_res = self._do_request_with_redir_check(method, load_url, data, req_headers, params, cdx)

    memento_dt = upstream_res.headers.get('Memento-Datetime')
    if memento_dt:
        dt = http_date_to_datetime(memento_dt)
        cdx['timestamp'] = datetime_to_timestamp(dt)
    elif cdx.get('memento_url'):
        # if 'memento_url' set and no Memento-Datetime header present
        # then its an error
        return None

    agg_type = upstream_res.headers.get('Warcserver-Type')
    if agg_type == 'warc':
        cdx['source'] = unquote(upstream_res.headers.get('Warcserver-Source-Coll'))
        return None, upstream_res.headers, upstream_res

    # reconstruct the upstream HTTP status line
    if upstream_res.version == 11:
        version = '1.1'
    else:
        version = '1.0'

    status = 'HTTP/{version} {status} {reason}\r\n'
    status = status.format(version=version,
                           status=upstream_res.status,
                           reason=upstream_res.reason)

    http_headers_buff = status

    orig_resp = upstream_res._original_response

    try:  #pragma: no cover
        #PY 3
        # NOTE(review): relies on the private http.client header list;
        # an AttributeError here falls through to the PY 2 branch below.
        resp_headers = orig_resp.headers._headers

        for n, v in resp_headers:
            nl = n.lower()
            if nl in self.SKIP_HEADERS:
                continue

            if nl in self.UNREWRITE_HEADERS:
                v = self.unrewrite_header(cdx, v)

            http_headers_buff += n + ': ' + v + '\r\n'

        http_headers_buff += '\r\n'

        try:
            # http headers could be encoded as utf-8 (though non-standard)
            # first try utf-8 encoding
            http_headers_buff = http_headers_buff.encode('utf-8')
        except:
            # then, fall back to latin-1
            http_headers_buff = http_headers_buff.encode('latin-1')

    except:  #pragma: no cover
        #PY 2
        resp_headers = orig_resp.msg.headers

        for line in resp_headers:
            n, v = line.split(':', 1)
            n = n.lower()
            v = v.strip()

            if n in self.SKIP_HEADERS:
                continue

            new_v = v
            if n in self.UNREWRITE_HEADERS:
                new_v = self.unrewrite_header(cdx, v)

            if new_v != v:
                http_headers_buff += n + ': ' + new_v + '\r\n'
            else:
                http_headers_buff += line

        # if python2, already byte headers, so leave as is
        http_headers_buff += '\r\n'

    # best-effort peer IP extraction from the underlying socket
    try:
        fp = upstream_res._fp.fp
        if hasattr(fp, 'raw'):  #pragma: no cover
            fp = fp.raw
        remote_ip = fp._sock.getpeername()[0]
    except:  #pragma: no cover
        remote_ip = None

    warc_headers = {}

    warc_headers['WARC-Type'] = 'response'
    warc_headers['WARC-Record-ID'] = self._make_warc_id()
    warc_headers['WARC-Target-URI'] = cdx['url']
    warc_headers['WARC-Date'] = datetime_to_iso_date(dt)

    if not cdx.get('is_live'):
        # record provenance for non-live (proxied/memento) captures
        now = datetime.datetime.utcnow()
        warc_headers['WARC-Source-URI'] = cdx.get('load_url')
        warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now)

    if remote_ip:
        warc_headers['WARC-IP-Address'] = remote_ip

    ct = upstream_res.headers.get('Content-Type')
    if ct:
        metadata = self.get_custom_metadata(ct, dt)
        if metadata:
            warc_headers['WARC-JSON-Metadata'] = json.dumps(metadata)

    warc_headers['Content-Type'] = 'application/http; msgtype=response'

    if method == 'HEAD':
        # HEAD responses carry no body
        content_len = 0
    else:
        content_len = upstream_res.headers.get('Content-Length', -1)

    self._set_content_len(content_len, warc_headers, len(http_headers_buff))

    warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())

    return (warc_headers, http_headers_buff, upstream_res)
def __call__(self, cdx, params):
    """
    Load the resource for `cdx` and return (out_headers, streamiter):
    upstream-facing headers plus an iterator over the WARC record body.
    Returns (None, None) when the resource cannot be loaded.
    """
    entry = self.load_resource(cdx, params)
    if not entry:
        return None, None

    # gzip-compress the outgoing stream only when explicitly requested
    compress = params.get('compress') == 'gzip'

    warc_headers, other_headers, stream = entry

    source = self._get_source_id(cdx)

    out_headers = {}
    out_headers['Warcserver-Type'] = 'warc'
    out_headers['Content-Type'] = 'application/warc-record'

    if params.get('recorder_skip'):
        # signal downstream recorder to skip writing this record
        out_headers['Recorder-Skip'] = '1'
        cdx['recorder_skip'] = '1'

    out_headers['Warcserver-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
    out_headers['Warcserver-Source-Coll'] = to_native_str(source)

    if not warc_headers:
        # No WARC record headers: pass through selected upstream headers as-is.
        if other_headers:
            out_headers['Link'] = other_headers.get('Link')
            out_headers['Memento-Datetime'] = other_headers.get('Memento-Datetime')
            if not compress:
                out_headers['Content-Length'] = other_headers.get('Content-Length')

        return out_headers, StreamIter(stream, closer=call_release_conn)

    target_uri = warc_headers.get_header('WARC-Target-URI')

    out_headers['WARC-Target-URI'] = target_uri
    out_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

    # convert WARC-Date (ISO format) to an HTTP-date Memento-Datetime
    memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
    out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)

    warc_headers_buff = warc_headers.to_bytes()

    if not compress:
        lenset = self._set_content_len(warc_headers.get_header('Content-Length'),
                                       out_headers,
                                       len(warc_headers_buff))
    else:
        # length not knowable once the stream is gzipped
        lenset = False

    streamiter = StreamIter(stream,
                            header1=warc_headers_buff,
                            header2=other_headers,
                            closer=call_release_conn)

    if compress:
        streamiter = compress_gzip_iter(streamiter)
        out_headers['Content-Encoding'] = 'gzip'

    #if not lenset:
    #    out_headers['Transfer-Encoding'] = 'chunked'
    #    streamiter = chunk_encode_iter(streamiter)

    return out_headers, streamiter
def load_resource(self, cdx, params):
    """
    Load a live/upstream resource described by `cdx` and build WARC-style
    metadata for it.

    Returns:
        None on failure (no load_url, video content, or missing expected
        Memento-Datetime);
        (None, upstream.headers, upstream) when upstream is already a
        'warc' response from another warcserver (pass-through);
        (warc_headers, http_headers_buff, upstream) otherwise, where
        warc_headers is a StatusAndHeaders and http_headers_buff the
        encoded reconstructed HTTP response headers.
    """
    load_url = cdx.get('load_url')
    if not load_url:
        return None

    if params.get('content_type') == VideoLoader.CONTENT_TYPE:
        # video content is handled by a different loader
        return None

    if self.forward_proxy_prefix and not cdx.get('is_live'):
        load_url = self.forward_proxy_prefix + load_url

    input_req = params['_input_req']

    req_headers = input_req.get_req_headers()

    dt = timestamp_to_datetime(cdx['timestamp'])

    if cdx.get('memento_url'):
        # ask the upstream memento source for the capture closest to dt
        req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

    method = input_req.get_req_method()
    data = input_req.get_req_body()

    # Use requests' PreparedRequest to normalize the URL and derive any
    # Authorization header embedded in it.
    p = PreparedRequest()
    try:
        p.prepare_url(load_url, None)
    except:
        raise LiveResourceException(load_url)
    p.prepare_headers(None)
    p.prepare_auth(None, load_url)

    auth = p.headers.get('Authorization')
    if auth:
        req_headers['Authorization'] = auth

    load_url = p.url

    # host is set to the actual host for live loading
    # ensure it is set to the load_url host
    if not cdx.get('is_live'):
        #req_headers.pop('Host', '')
        req_headers['Host'] = urlsplit(p.url).netloc

        referrer = cdx.get('set_referrer')
        if referrer:
            req_headers['Referer'] = referrer

    upstream_res = self._do_request_with_redir_check(method, load_url, data, req_headers, params, cdx)

    memento_dt = upstream_res.headers.get('Memento-Datetime')
    if memento_dt:
        dt = http_date_to_datetime(memento_dt)
        cdx['timestamp'] = datetime_to_timestamp(dt)
    elif cdx.get('memento_url'):
        # if 'memento_url' set and no Memento-Datetime header present
        # then its an error
        return None

    agg_type = upstream_res.headers.get('Warcserver-Type')
    if agg_type == 'warc':
        cdx['source'] = unquote(upstream_res.headers.get('Warcserver-Source-Coll'))
        return None, upstream_res.headers, upstream_res

    # reconstruct the upstream HTTP status line
    if upstream_res.version == 11:
        version = '1.1'
    else:
        version = '1.0'

    status = 'HTTP/{version} {status} {reason}\r\n'
    status = status.format(version=version,
                           status=upstream_res.status,
                           reason=upstream_res.reason)

    http_headers_buff = status

    orig_resp = upstream_res._original_response

    try:  #pragma: no cover
        #PY 3
        # NOTE(review): relies on the private http.client header list;
        # an AttributeError here falls through to the PY 2 branch below.
        resp_headers = orig_resp.headers._headers

        for n, v in resp_headers:
            nl = n.lower()
            if nl in self.SKIP_HEADERS:
                continue

            if nl in self.UNREWRITE_HEADERS:
                v = self.unrewrite_header(cdx, v)

            http_headers_buff += n + ': ' + v + '\r\n'

        http_headers_buff += '\r\n'

        try:
            # http headers could be encoded as utf-8 (though non-standard)
            # first try utf-8 encoding
            http_headers_buff = http_headers_buff.encode('utf-8')
        except:
            # then, fall back to latin-1
            http_headers_buff = http_headers_buff.encode('latin-1')

    except:  #pragma: no cover
        #PY 2
        resp_headers = orig_resp.msg.headers

        for line in resp_headers:
            n, v = line.split(':', 1)
            n = n.lower()
            v = v.strip()

            if n in self.SKIP_HEADERS:
                continue

            new_v = v
            if n in self.UNREWRITE_HEADERS:
                new_v = self.unrewrite_header(cdx, v)

            if new_v != v:
                http_headers_buff += n + ': ' + new_v + '\r\n'
            else:
                http_headers_buff += line

        # if python2, already byte headers, so leave as is
        http_headers_buff += '\r\n'

    # best-effort peer IP extraction from the underlying socket
    try:
        fp = upstream_res._fp.fp
        if hasattr(fp, 'raw'):  #pragma: no cover
            fp = fp.raw
        remote_ip = fp._sock.getpeername()[0]
    except:  #pragma: no cover
        remote_ip = None

    warc_headers = {}

    warc_headers['WARC-Type'] = 'response'
    warc_headers['WARC-Record-ID'] = self._make_warc_id()
    warc_headers['WARC-Target-URI'] = cdx['url']
    warc_headers['WARC-Date'] = datetime_to_iso_date(dt)

    if not cdx.get('is_live'):
        # record provenance for non-live (proxied/memento) captures
        now = datetime.datetime.utcnow()
        warc_headers['WARC-Source-URI'] = cdx.get('load_url')
        warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now)

    if remote_ip:
        warc_headers['WARC-IP-Address'] = remote_ip

    ct = upstream_res.headers.get('Content-Type')
    if ct:
        metadata = self.get_custom_metadata(ct, dt)
        if metadata:
            warc_headers['WARC-JSON-Metadata'] = json.dumps(metadata)

    warc_headers['Content-Type'] = 'application/http; msgtype=response'

    if method == 'HEAD':
        # HEAD responses carry no body
        content_len = 0
    else:
        content_len = upstream_res.headers.get('Content-Length', -1)

    self._set_content_len(content_len, warc_headers, len(http_headers_buff))

    warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())

    return (warc_headers, http_headers_buff, upstream_res)