def test_timegate_latest_request_timestamp(self):
    """ TimeGate with no Accept-Datetime header

    A request without Accept-Datetime should 302-redirect to the most
    recent capture, expose Memento link headers, and place a current
    14-digit timestamp in the Location url.
    """
    dt = 'Mon, 27 Jan 2014 17:12:39 GMT'
    resp = self.testapp.get(
        '/pywb-non-exact/http://www.iana.org/_css/2013.1/screen.css')

    # TimeGate responds with a redirect and advertises datetime negotiation
    assert resp.status_int == 302
    assert resp.headers[VARY] == 'accept-datetime'

    links = self.get_links(resp)
    assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links
    assert self.make_timemap_link(
        'http://www.iana.org/_css/2013.1/screen.css',
        coll='pywb-non-exact') in links
    assert self.make_memento_link(
        'http://www.iana.org/_css/2013.1/screen.css',
        '20140127171239', dt, coll='pywb-non-exact') in links

    # a TimeGate redirect itself is not a memento
    assert MEMENTO_DATETIME not in resp.headers

    assert '/pywb-non-exact/' in resp.headers['Location']

    # the Location path carries a full 14-digit timestamp not after "now"
    wburl = resp.headers['Location'].split('/pywb-non-exact/')[-1]
    ts = wburl.split('/')[0]
    assert len(ts) == 14
    assert timestamp_now() >= ts
def handle_not_found(self, wbrequest, nfe):
    """Handle a capture-not-found error, recording the miss in redis.

    Delegates to the parent handler for the actual response, then — if
    this request is an embedded resource of a page in this collection
    (referrer starts with the collection prefix) — records the outcome
    against the referring page's redis key:

    * 4xx status (and not explicitly skipped): 'MISSING ' entry
    * 2xx status: 'LIVE ' entry
    * anything else: not recorded

    :param wbrequest: the incoming wayback request
    :param nfe: the not-found exception raised by lookup
    :return: the response produced by the parent handler, unchanged
    """
    response = super(MementoHandler, self).handle_not_found(wbrequest, nfe)

    if (not wbrequest.wb_url.is_query() and
            wbrequest.referrer and
            wbrequest.referrer.startswith(wbrequest.wb_prefix)):

        # the referring page's wb url, with the collection prefix stripped
        wb_url = WbUrl(wbrequest.referrer[len(wbrequest.wb_prefix):])

        status = response.status_headers.get_statuscode()

        if status.startswith('4') and not self.skip_missing_count(wb_url):
            key_name = 'MISSING '
        elif status.startswith('2'):
            key_name = 'LIVE '
        else:
            key_name = None

        if key_name:
            page_key = redis_client.get_url_key(wb_url)

            ts = timestamp_now()
            # entry value: '<TYPE> <ts> <missing url>'
            value = (key_name + ts + ' ' + wbrequest.wb_url.url)
            # stored value: '<epoch secs> text/html'
            save_value = str(timestamp_to_sec(ts))
            save_value += ' ' + 'text/html'
            redis_client.set_embed_entry(page_key, value, save_value)

    return response
def test_timegate_latest_request_timestamp(self):
    """ TimeGate with no Accept-Datetime header

    Without Accept-Datetime the TimeGate 302s to the latest capture;
    verify Vary, Memento link relations, absence of Memento-Datetime,
    and that the redirect url embeds a current 14-digit timestamp.
    """
    dt = 'Mon, 27 Jan 2014 17:12:39 GMT'
    resp = self.testapp.get('/pywb-non-exact/http://www.iana.org/_css/2013.1/screen.css')

    assert resp.status_int == 302
    assert resp.headers[VARY] == 'accept-datetime'

    # link header must expose original, timemap and memento relations
    links = self.get_links(resp)
    assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links
    assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css', coll='pywb-non-exact') in links
    assert self.make_memento_link('http://www.iana.org/_css/2013.1/screen.css', '20140127171239', dt, coll='pywb-non-exact') in links

    # the redirect is not itself a memento
    assert MEMENTO_DATETIME not in resp.headers

    assert '/pywb-non-exact/' in resp.headers['Location']

    # timestamp segment of the Location url: 14 digits, not in the future
    wburl = resp.headers['Location'].split('/pywb-non-exact/')[-1]
    ts = wburl.split('/')[0]
    assert len(ts) == 14
    assert timestamp_now() >= ts
def get_top_frame(self, wb_url, wb_prefix, host_prefix, env,
                  frame_mod, replay_mod, coll='', extra_params=None):
    """Render the top frame html for framed replay.

    Builds a minimal wbrequest-like dict plus template params (embed
    url, timestamp, banner) and renders them via render_to_string.

    :param wb_url: parsed wayback url being replayed
    :param wb_prefix: collection url prefix
    :param host_prefix: scheme+host prefix
    :param env: wsgi/template environment passed through to the renderer
    :param frame_mod: modifier for the frame itself
    :param replay_mod: modifier used for the embedded replay url
    :param coll: collection id (default '')
    :param extra_params: optional dict of additional template params
    :return: rendered top-frame string
    """
    # prefer the url's own timestamp; fall back to the current time
    timestamp = wb_url.timestamp or timestamp_now()

    fake_request = {
        'host_prefix': host_prefix,
        'wb_prefix': wb_prefix,
        'wb_url': wb_url,
        'coll': coll,
        'options': {'frame_mod': frame_mod, 'replay_mod': replay_mod},
    }

    render_params = {
        'embed_url': wb_url.to_str(mod=replay_mod),
        'wbrequest': fake_request,
        'timestamp': timestamp,
        'url': wb_url.get_url(),
        'banner_html': self.banner_file,
    }

    if extra_params:
        render_params.update(extra_params)

    return self.render_to_string(env, **render_params)
def test_live():
    """Querying the live index source yields one entry keyed at 'now'."""
    target = 'http://example.com/'
    source = LiveIndexSource()

    results, errors = query_single_source(source, {'url': target})

    # live source synthesizes a single cdx line with the current timestamp
    want = 'com,example)/ {0} http://example.com/'.format(timestamp_now())
    assert key_ts_res(results, 'load_url') == want
    assert errors == {}
def load_index(self, params):
    """Synthesize a single CDX entry pointing at the proxied upstream url.

    No real index lookup happens: the timestamp is either the requested
    'closest' value or the current time, and load/memento urls are both
    the templated proxy url.

    :param params: query params dict; 'key' (bytes) and 'url' required
    :return: result of self._do_load on the synthesized cdx
    """
    urlkey = params.get('key').decode('utf-8')
    closest = params.get('closest')

    cdx = CDXObject()
    cdx['urlkey'] = urlkey
    if closest:
        cdx['timestamp'] = closest
    else:
        cdx['timestamp'] = timestamp_now()
    cdx['url'] = params['url']

    proxied = res_template(self.proxy_url, params)
    cdx['load_url'] = proxied
    cdx['memento_url'] = proxied

    return self._do_load(cdx, params)
def __call__(self, params):
    """Run a CDX query with the given params.

    Resolves the symbolic closest value 'now' to the current timestamp,
    translates an optional 'content_type' into a mime filter, then loads
    and post-processes the cdx iterator.

    :param params: mutable query params dict (modified in place)
    :return: (processed cdx iterator, dict of per-source errors)
    """
    # 'now' is symbolic; replace with a concrete 14-digit timestamp
    if params.get('closest') == 'now':
        params['closest'] = timestamp_now()

    mime = params.get('content_type')
    if mime:
        params['filter'] = '=mime:' + mime

    query = CDXQuery(params)
    cdx_iter, errs = self.load_index(query.params)

    return process_cdx(cdx_iter, query), dict(errs)
def _redirect_if_needed(self, wbrequest, cdx):
    """Return a redirect response if this replay request should redirect,
    or None to continue with normal replay.

    Redirects happen for timegate requests (or latest-replay urls) and,
    when redir_to_exact is set, for requests whose timestamp differs from
    the matched capture's. Proxy-mode requests, 'noredir' requests and
    range-cache requests are never redirected.

    :param wbrequest: the incoming wayback request
    :param cdx: the matched cdx capture (dict-like)
    :return: a response object with a Location header, or None
    """
    if wbrequest.options['is_proxy']:
        return None

    if wbrequest.custom_params.get('noredir'):
        return None

    is_timegate = (wbrequest.options.get('is_timegate', False))
    if not is_timegate:
        is_timegate = wbrequest.wb_url.is_latest_replay()

    redir_needed = is_timegate

    # exact-timestamp mode: redirect whenever the requested ts doesn't match
    if not redir_needed and self.redir_to_exact:
        redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)

    if not redir_needed:
        return None

    # range requests are served from the range cache without redirecting
    if self.enable_range_cache and wbrequest.extract_range():
        return None

    # non-exact timegate redirects to "now" rather than the capture's ts
    if is_timegate and not self.redir_to_exact:
        timestamp = timestamp_now()
    else:
        timestamp = cdx['timestamp']

    new_url = (wbrequest.urlrewriter.
               get_new_url(timestamp=timestamp,
                           url=cdx['url']))

    if wbrequest.method == 'POST':
        #   FF shows a confirm dialog, so can't use 307 effectively
        #   was: statusline = '307 Same-Method Internal Redirect'
        return None
    elif is_timegate:
        statusline = '302 Found'
    else:
        # clear cdx line to indicate internal redirect
        statusline = '302 Internal Redirect'
        cdx = None

    status_headers = StatusAndHeaders(statusline,
                                      [('Location', new_url)])

    return self.response_class(status_headers,
                               wbrequest=wbrequest,
                               cdx=cdx)
def _redirect_if_needed(self, wbrequest, cdx):
    """Decide whether to redirect this replay request; None means no redirect.

    A redirect is issued for timegate/latest-replay requests, or — when
    redir_to_exact is enabled — when the requested timestamp does not
    match the capture's. Skipped entirely in proxy mode, for 'noredir'
    requests, and for cached range requests.

    :param wbrequest: incoming wayback request
    :param cdx: matched cdx capture
    :return: redirect response or None
    """
    if wbrequest.options['is_proxy']:
        return None

    if wbrequest.custom_params.get('noredir'):
        return None

    is_timegate = (wbrequest.options.get('is_timegate', False))
    if not is_timegate:
        is_timegate = wbrequest.wb_url.is_latest_replay()

    redir_needed = is_timegate

    if not redir_needed and self.redir_to_exact:
        # redirect to align the url's timestamp with the actual capture
        redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)

    if not redir_needed:
        return None

    if self.enable_range_cache and wbrequest.extract_range():
        return None

    # timegate without exact-redirect: use current time in the target url
    if is_timegate and not self.redir_to_exact:
        timestamp = timestamp_now()
    else:
        timestamp = cdx['timestamp']

    new_url = (wbrequest.urlrewriter.get_new_url(timestamp=timestamp,
                                                 url=cdx['url']))

    if wbrequest.method == 'POST':
        #   FF shows a confirm dialog, so can't use 307 effectively
        #   was: statusline = '307 Same-Method Internal Redirect'
        return None
    elif is_timegate:
        statusline = '302 Found'
    else:
        # clear cdx line to indicate internal redirect
        statusline = '302 Internal Redirect'
        cdx = None

    status_headers = StatusAndHeaders(statusline,
                                      [('Location', new_url)])

    return self.response_class(status_headers,
                               wbrequest=wbrequest,
                               cdx=cdx)
def _get_timemap_query(self, params):
    """Build the backend query string for a timemap request.

    The 'from'/'to' params are padded to full timestamps; missing bounds
    default to the earliest known date and the current time respectively.

    :param params: dict with 'url' and optional 'from'/'to' timestamps
    :return: query string 'exacturlexpand:<url> date:<from>-<to>'
    """
    lower = params.get('from')
    # pad a partial lower bound, else start from the earliest date
    lower = pad_timestamp(lower, EARLIEST_DATE) if lower else EARLIEST_DATE

    upper = params.get('to')
    # pad a partial upper bound, else end at the current time
    upper = pad_timestamp(upper, LATEST_DATE) if upper else timestamp_now()

    return 'exacturlexpand:{0} date:{1}-{2}'.format(
        params.get('url'), lower, upper)
def _get_timemap_query(self, params):
    """Construct the date-bounded timemap query for the given url.

    Bounds come from 'from'/'to' params, padded to 14 digits; absent
    bounds fall back to EARLIEST_DATE and the current timestamp.

    :param params: dict containing 'url' plus optional 'from'/'to'
    :return: formatted query string
    """
    start = params.get('from')
    if not start:
        start = EARLIEST_DATE
    else:
        start = pad_timestamp(start, EARLIEST_DATE)

    end = params.get('to')
    if end:
        end = pad_timestamp(end, LATEST_DATE)
    else:
        end = timestamp_now()

    query = 'exacturlexpand:{0} date:{1}-{2}'.format(params.get('url'),
                                                     start,
                                                     end)
    return query
def test_redirect_non_exact_latest_replay_ts(self):
    """Non-exact collection: a timestamp-less url 302s to a url carrying
    the current 14-digit timestamp, and that timestamp appears in the
    replayed page's links."""
    resp = self.testapp.get('/pywb-non-exact/http://example.com/')
    assert resp.status_int == 302

    assert resp.headers['Location'].endswith('/http://example.com')

    # extract ts, which should be current time
    ts = resp.headers['Location'].rsplit('/http://')[0].rsplit('/', 1)[-1]
    assert len(ts) == 14, ts

    resp = resp.follow()
    self._assert_basic_html(resp)

    # ensure the current ts is present in the links
    assert '"{0}"'.format(ts) in resp.body
    assert '/pywb-non-exact/{0}/http://www.iana.org/domains/example'.format(ts) in resp.body

    # ensure ts is current ts
    assert timestamp_now() >= ts, ts
def add_page(self, user, coll, rec, pagedata):
    """Store one page entry in the recording's redis page hash.

    If the page has no timestamp, one is derived from the recorded
    captures for its url, falling back to the current time. The hash
    field is '<url> <timestamp>' and the value is the page as json.

    :param user: owning user id
    :param coll: collection id
    :param rec: recording id
    :param pagedata: page dict with at least 'url'; may be mutated
                     (timestamp filled in)
    :return: empty dict on success
    """
    self.assert_can_write(user, coll)

    key = self.page_key.format(user=user, coll=coll, rec=rec)

    if not pagedata.get('timestamp'):
        # derive from recorded captures, else use "now"
        derived = self._get_url_ts(user, coll, rec, pagedata['url'])
        pagedata['timestamp'] = derived or timestamp_now()

    field = pagedata['url'] + ' ' + pagedata['timestamp']
    self.redis.hset(key, field, json.dumps(pagedata).encode('utf-8'))

    return {}
def test_redirect_non_exact_latest_replay_ts(self):
    """In a non-exact collection, a latest-replay request redirects to a
    url with the current timestamp, which then shows up in the page."""
    resp = self.testapp.get('/pywb-non-exact/http://example.com/')
    assert resp.status_int == 302

    assert resp.headers['Location'].endswith('/http://example.com')

    # extract ts, which should be current time
    ts = resp.headers['Location'].rsplit('/http://')[0].rsplit('/', 1)[-1]
    assert len(ts) == 14, ts

    resp = resp.follow()
    self._assert_basic_html(resp)

    # ensure the current ts is present in the links
    assert '"{0}"'.format(ts) in resp.body
    assert '/pywb-non-exact/{0}/http://www.iana.org/domains/example'.format(
        ts) in resp.body

    # ensure ts is current ts
    assert timestamp_now() >= ts, ts
def download_coll():
    """Stream a download of all WARC data for a collection.

    Resolves the 'coll' query param (anonymous '@anon...' collections use
    the anon user), builds a timestamped filename, and returns the data
    with attachment headers set.

    :raises HTTPError: 404 when no download data is available
    :return: the response object with body set to the download stream
    """
    coll = request.query.get('coll')
    ts = timestamp_now()
    if coll.startswith('@anon'):
        user = manager.get_anon_user()
        filename = 'webarchive-all-{0}.warc.gz'.format(ts)
    else:
        user, coll = path_parser.get_user_coll(coll)
        # FIX: ts was passed to format() but the string had no {2}, so the
        # timestamp was silently dropped; include it as in the anon branch
        filename = '{0}-{1}-all-{2}.warc.gz'.format(user, coll, ts)

    res = manager.download_all(user, coll)

    if not res:
        raise HTTPError(status=404, body='No Download Data Available')

    length, func = res
    response.headers['Content-Type'] = 'text/plain'
    response.headers['Content-Disposition'] = 'attachment; filename=' + filename
    response.headers['Content-Length'] = length
    response.body = func()
    return response
def download_coll():
    """Download all WARC data for the requested collection as one file.

    Anonymous collections ('@anon...') resolve to the anon user; named
    collections are parsed into (user, coll). The filename embeds the
    current timestamp so repeated downloads do not collide.

    :raises HTTPError: 404 when the manager has no data to download
    :return: the response object with attachment headers and body set
    """
    coll = request.query.get('coll')
    ts = timestamp_now()
    if coll.startswith('@anon'):
        user = manager.get_anon_user()
        filename = 'webarchive-all-{0}.warc.gz'.format(ts)
    else:
        user, coll = path_parser.get_user_coll(coll)
        # FIX: format string previously lacked a {2} placeholder, so the
        # ts argument was ignored; embed it like the anonymous branch does
        filename = '{0}-{1}-all-{2}.warc.gz'.format(user, coll, ts)

    res = manager.download_all(user, coll)

    if not res:
        raise HTTPError(status=404, body='No Download Data Available')

    length, func = res
    response.headers['Content-Type'] = 'text/plain'
    response.headers[
        'Content-Disposition'] = 'attachment; filename=' + filename
    response.headers['Content-Length'] = length
    response.body = func()
    return response
def import_pages(self, user, coll, rec, pagelist):
    """Bulk-insert page entries into the recording's redis page hash.

    Pages missing a timestamp get one derived from recorded captures for
    their url, falling back to the current time. All entries are written
    in a single hmset, keyed by '<url> <timestamp>'.

    :param user: owning user id
    :param coll: collection id
    :param rec: recording id
    :param pagelist: iterable of page dicts (each with 'url'); entries
                     may be mutated (timestamp filled in)
    :return: empty dict on success
    """
    self.assert_can_admin(user, coll)

    key = self.page_key.format(user=user, coll=coll, rec=rec)

    entries = {}
    for page in pagelist:
        if not page.get('timestamp'):
            derived = self._get_url_ts(user, coll, rec, page['url'])
            page['timestamp'] = derived or timestamp_now()

        field = page['url'] + ' ' + page['timestamp']
        entries[field] = json.dumps(page).encode('utf-8')

    self.redis.hmset(key, entries)
    return {}
def fetch_request(self, url, urlrewriter,
                  head_insert_func=None,
                  urlkey=None,
                  env=None,
                  req_headers=None,
                  timestamp=None,
                  follow_redirects=False,
                  skip_recording=False,
                  verify=True,
                  remote_only=True):
    """Fetch a live url (http(s) or local file) and rewrite its content.

    Cleans up accidentally-rewritten urls, fetches the resource either
    remotely or from the local filesystem, synthesizes a cdx dict for it,
    and runs the content rewriter.

    :param url: url to fetch (may be fixed up before use)
    :param urlrewriter: rewriter used for the fetched content
    :param head_insert_func: optional callable producing head insert html
    :param urlkey: explicit canonical url key (say for testing)
    :param env: optional wsgi env; receives 'pywb.cdx' on success
    :param req_headers: optional dict of request headers to send
    :param timestamp: capture timestamp; defaults to "now"
    :param follow_redirects: whether the http fetch follows redirects
    :param skip_recording: skip recording of the fetched response
    :param verify: verify TLS certs on fetch
    :param remote_only: treat all urls as remote, never local files
    :return: result of the content rewriter
    """
    # FIX: default was a shared mutable dict (req_headers={}), which any
    # mutation would leak across calls; use a None sentinel instead
    if req_headers is None:
        req_headers = {}

    ts_err = url.split('///')

    # fixup for accidental erroneous rewrite which has ///
    # (unless file:///)
    if len(ts_err) > 1 and ts_err[0] != 'file:':
        url = 'http://' + ts_err[1]

    if url.startswith('//'):
        url = 'http:' + url

    if remote_only or is_http(url):
        is_remote = True
    else:
        is_remote = False
        if not url.startswith('file:'):
            url = to_file_url(url)

    # explicit urlkey may be passed in (say for testing)
    if not urlkey:
        urlkey = canonicalize(url)

    if is_remote:
        (status_headers, stream) = self.fetch_http(url, urlkey, env,
                                                   req_headers,
                                                   follow_redirects,
                                                   skip_recording,
                                                   verify)
    else:
        (status_headers, stream) = self.fetch_local_file(url)

    if timestamp is None:
        timestamp = timestamp_now()

    cdx = {'urlkey': urlkey,
           'timestamp': timestamp,
           'url': url,
           'status': status_headers.get_statuscode(),
           'mime': status_headers.get_header('Content-Type'),
           'is_live': True,
          }

    result = (self.rewriter.
              rewrite_content(urlrewriter,
                              status_headers,
                              stream,
                              head_insert_func=head_insert_func,
                              urlkey=urlkey,
                              cdx=cdx))

    if env:
        env['pywb.cdx'] = cdx

    return result
def fetch_request(self, url, urlrewriter,
                  head_insert_func=None,
                  urlkey=None,
                  env=None,
                  req_headers=None,
                  timestamp=None,
                  follow_redirects=False,
                  skip_recording=False,
                  verify=True,
                  remote_only=True):
    """Fetch and rewrite a live resource (remote http(s) or local file).

    Repairs accidentally-rewritten urls (stray '///', scheme-relative
    '//'), determines remote vs local, fetches, builds a synthetic cdx
    record, and passes everything to the content rewriter.

    :param url: url to fetch; may be normalized before use
    :param urlrewriter: content url rewriter
    :param head_insert_func: optional head-insert html generator
    :param urlkey: explicit canonical key (say for testing)
    :param env: optional wsgi env; 'pywb.cdx' is set on it when provided
    :param req_headers: optional request headers dict
    :param timestamp: capture timestamp; current time when None
    :param follow_redirects: follow http redirects during fetch
    :param skip_recording: do not record the fetched response
    :param verify: verify TLS certificates
    :param remote_only: never fall back to local file fetching
    :return: rewritten content result
    """
    # FIX: mutable default argument replaced with None sentinel so the
    # headers dict is no longer shared between unrelated calls
    if req_headers is None:
        req_headers = {}

    ts_err = url.split('///')

    # fixup for accidental erroneous rewrite which has ///
    # (unless file:///)
    if len(ts_err) > 1 and ts_err[0] != 'file:':
        url = 'http://' + ts_err[1]

    if url.startswith('//'):
        url = 'http:' + url

    if remote_only or is_http(url):
        is_remote = True
    else:
        is_remote = False
        if not url.startswith('file:'):
            url = to_file_url(url)

    # explicit urlkey may be passed in (say for testing)
    if not urlkey:
        urlkey = canonicalize(url)

    if is_remote:
        (status_headers, stream) = self.fetch_http(url, urlkey, env,
                                                   req_headers,
                                                   follow_redirects,
                                                   skip_recording,
                                                   verify)
    else:
        (status_headers, stream) = self.fetch_local_file(url)

    if timestamp is None:
        timestamp = timestamp_now()

    cdx = {
        'urlkey': urlkey,
        'timestamp': timestamp,
        'url': url,
        'status': status_headers.get_statuscode(),
        'mime': status_headers.get_header('Content-Type'),
        'is_live': True,
    }

    result = (self.rewriter.rewrite_content(
        urlrewriter,
        status_headers,
        stream,
        head_insert_func=head_insert_func,
        urlkey=urlkey,
        cdx=cdx))

    if env:
        env['pywb.cdx'] = cdx

    return result
def handle_download(self, user, coll, rec):
    """Stream a WARC download of a collection (or selected recordings).

    Builds warcinfo records for the collection and each selected
    recording, then yields warcinfo + the raw contents of every warc
    file. Either pre-computes Content-Length or streams with chunked
    transfer encoding, per self.download_chunk_encoded.

    :param user: owning user id
    :param coll: collection id
    :param rec: recording id, comma-separated list of ids, or '*' for all
    :return: generator yielding the download bytes
    """
    collection = self.manager.get_collection(user, coll, rec)
    if not collection:
        self._raise_error(404, 'Collection not found', id=coll)

    now = timestamp_now()

    # download name: collection id, single rec id, or coll-recs combo
    name = collection['id']
    if rec != '*':
        rec_list = rec.split(',')
        if len(rec_list) == 1:
            name = rec
        else:
            name += '-' + rec
    else:
        rec_list = None

    filename = self.download_filename.format(title=quote(name),
                                             timestamp=now)

    loader = BlockLoader()

    coll_info = self.create_coll_warcinfo(user, collection, filename)

    def iter_infos():
        # yield (recording, warcinfo record, total size) per selected rec
        for recording in collection['recordings']:
            if rec_list and recording['id'] not in rec_list:
                continue

            warcinfo = self.create_rec_warcinfo(user,
                                                collection,
                                                recording,
                                                filename)

            size = len(warcinfo)
            size += recording['size']
            yield recording, warcinfo, size

    def read_all(infos):
        yield coll_info

        for recording, warcinfo, _ in infos:
            yield warcinfo

            for warc_path in self._iter_all_warcs(user, coll, recording['id']):
                try:
                    fh = loader.load(warc_path)
                # FIX: was a bare 'except:', which also swallowed
                # SystemExit/KeyboardInterrupt; keep best-effort skip
                except Exception:
                    print('Skipping invalid ' + warc_path)
                    continue

                for chunk in StreamIter(fh):
                    yield chunk

    response.headers['Content-Type'] = 'application/octet-stream'
    response.headers[
        'Content-Disposition'] = "attachment; filename*=UTF-8''" + filename

    # if not transfer-encoding, store infos and calculate total size
    if not self.download_chunk_encoded:
        size = len(coll_info)
        infos = list(iter_infos())
        size += sum(size for r, i, size in infos)
        response.headers['Content-Length'] = size
        return read_all(infos)

    else:
        # stream everything
        response.headers['Transfer-Encoding'] = 'chunked'
        return read_all(iter_infos())
def write_snapshot(self, user, coll, url, title, html_text,
                   referrer, user_agent, browser=None):
    """Write a static html snapshot into the 'Static Snapshots' recording.

    PUTs the html to the upstream content app as a snapshot record; on
    success (and when a title is given) registers a page entry tagged
    'snapshot', timestamped from the returned WARC-Date when available.

    :param user: owning user id
    :param coll: collection id
    :param url: url the snapshot represents
    :param title: page title; falsy title skips page registration
    :param html_text: snapshot html (will be utf-8 encoded)
    :param referrer: value for the WARC-Referer header
    :param user_agent: value for the WARC-User-Agent header
    :param browser: optional browser id recorded on the page entry
    :return: {'snapshot': page_data} on success, {'snapshot': ''} when
             no title, or {'error_message': ...} on failure
    """
    snap_title = 'Static Snapshots'

    snap_rec = self.sanitize_title(snap_title)

    # lazily create the shared snapshots recording on first use
    if not self.manager.has_recording(user, coll, snap_rec):
        recording = self.manager.create_recording(user, coll,
                                                  snap_rec,
                                                  snap_title)

    kwargs = dict(user=user,
                  coll=quote(coll),
                  rec=quote(snap_rec, safe='/*'),
                  type='snapshot')

    params = {'url': url}

    upstream_url = self.manager.content_app.get_upstream_url('', kwargs, params)

    headers = {'Content-Type': 'text/html; charset=utf-8',
               'WARC-User-Agent': user_agent,
               'WARC-Referer': referrer,
              }

    r = requests.put(upstream_url,
                     data=BytesIO(html_text.encode('utf-8')),
                     headers=headers,
                    )

    try:
        res = r.json()
        # NOTE(review): upstream reports success as the string 'true'
        if res['success'] != 'true':
            print(res)
            return {'error_message': 'Snapshot Failed'}

        warc_date = res.get('WARC-Date')

    except Exception as e:
        print(e)
        return {'error_message': 'Snapshot Failed'}

    # no title: snapshot stored, but no page entry is created
    if not title:
        return {'snapshot': ''}

    # prefer the actual WARC record date; fall back to "now"
    if warc_date:
        timestamp = iso_date_to_timestamp(warc_date)
    else:
        timestamp = timestamp_now()

    page_data = {'url': url,
                 'title': title,
                 'timestamp': timestamp,
                 'tags': ['snapshot'],
                }

    if browser:
        page_data['browser'] = browser

    res = self.manager.add_page(user, coll, snap_rec, page_data)

    return {'snapshot': page_data}