def proxy_route_request(self, url, environ):
    """Compute the proxy routing prefix for *url* from per-IP state in Redis.

    Reads the client's collection/mode/cache settings from the
    ``up:<REMOTE_ADDR>`` hash, stores the default timestamp and cache flag
    into the WSGI environ, and routes into the collection's replay or
    record prefix.  Falls back to ``self.proxy_prefix`` when no collection
    is configured or when the lookup fails.

    :param str url: the requested URL
    :param dict environ: WSGI environ for the request (mutated in place)
    :returns: routing prefix + url
    :rtype: str
    """
    # Fix: default up-front so an exception inside the try block cannot
    # leave proxy_prefix unbound (the original raised UnboundLocalError
    # on the final return after printing the traceback).
    proxy_prefix = self.proxy_prefix

    try:
        key = 'up:' + environ['REMOTE_ADDR']
        timestamp, coll, mode, cache = self.redis.hmget(
            key, 'timestamp', 'coll', 'mode', 'cache')

        #environ['pywb_redis_key'] = key
        environ['pywb_proxy_default_timestamp'] = timestamp or timestamp_now()
        environ['pywb_cache'] = cache

        if coll:
            self.ensure_coll_exists(coll)

            if mode == 'replay' or coll == 'live':
                proxy_prefix = '/' + coll + '/bn_/'
            else:
                proxy_prefix = '/' + coll + '/record/bn_/'

    except Exception:
        traceback.print_exc()

    return proxy_prefix + url
def ingest(self, text, params):
    """Index a captured page's text content into Solr.

    :param bytes text: already-parsed page text, UTF-8 encoded
    :param dict params: capture metadata ('user', 'coll', 'rec', 'pid',
        'url', 'title', 'timestamp', 'hasScreenshot')
    :returns: the Solr HTTP response
    :rtype: requests.Response
    """
    # text already parsed
    content = text.decode('utf-8')
    title = params.get('title') or params.get('url')
    url = params.get('url')
    timestamp_s = params.get('timestamp') or timestamp_now()
    timestamp_dt = timestamp_to_iso_date(timestamp_s)
    has_screenshot_b = params.get('hasScreenshot') == '1'

    title = title or url

    digest = self.get_digest(content)

    #if self.update_if_dupe(digest, coll, url, timestamp_ss, timestamp_dts):
    #    return

    data = {
        'user_s': params.get('user'),
        'coll_s': params.get('coll'),
        'rec_s': params.get('rec'),
        'id': params.get('pid'),
        'title_t': title,
        'content_t': content,
        'url_s': url,
        'digest_s': digest,
        'timestamp_s': timestamp_s,
        'timestamp_dt': timestamp_dt,
        'has_screenshot_b': has_screenshot_b,
    }

    # Fix: surface the Solr response instead of silently discarding it
    # (previously bound to an unused local), so callers can check status.
    return requests.post(self.solr_api, json=data)
def write_dns(self, host, dns):
    """Write the DNS lookup result for *host* as a WARC ``resource`` record.

    The record is written even when *dns* is empty.

    :param str host: hostname that was resolved
    :param dns: iterable of resolver reply objects with ``ttl`` and ``host``
    """
    # TODO: we filter the addresses early, should we warc the unfiltered dns repsonse?
    # the response object doesn't contain the query type 'A' or 'AAAA'
    # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
    kind = 'A'
    if self.writer is None:
        self.open()

    payload = timestamp_now() + '\r\n'

    for r in dns:
        try:
            payload += host + '.\t' + str(
                r.ttl) + '\tIN\t' + kind + '\t' + r.host + '\r\n'
        except Exception as e:
            # Fix: the original passed bare positional args with no %s
            # placeholders, which raises a logging formatting error
            # instead of logging the reply.
            LOGGER.info('problem converting dns reply for warcing: %r (%s)',
                        r, e)

    payload = payload.encode('utf-8')

    record = self.writer.create_warc_record('dns:' + host, 'resource',
                                            payload=BytesIO(payload),
                                            warc_content_type='text/dns',
                                            length=len(payload))
    self.writer.write_record(record)
    LOGGER.debug('wrote warc dns response record%s for host %s',
                 p(self.prefix), host)
    stats.stats_sum('warc dns' + p(self.prefix), 1)
def write_cdxj(self, user, cdxj_key):
    """Persist the sorted CDXJ lines stored under *cdxj_key* to a file.

    Reuses an already-recorded index file when one exists; otherwise
    writes a new uniquely-named file under the user's temp warc path and
    records its local-store URL as a property.

    :returns: (cdxj filename, full path)
    """
    #full_filename = self.redis.hget(warc_key, self.INDEX_FILE_KEY)
    full_filename = self.get_prop(self.INDEX_FILE_KEY)
    if full_filename:
        return (os.path.basename(strip_prefix(full_filename)),
                full_filename)

    dirname = user.get_user_temp_warc_path()
    os.makedirs(dirname, exist_ok=True)

    cdxj_filename = self.INDEX_NAME_TEMPL.format(
        timestamp=timestamp_now(),
        random=base64.b32encode(os.urandom(5)).decode('utf-8'))

    full_filename = os.path.join(dirname, cdxj_filename)

    with open(full_filename, 'wt') as out:
        for line in self.redis.zrange(cdxj_key, 0, -1):
            out.write(line + '\n')
        out.flush()

    full_url = add_local_store_prefix(
        full_filename.replace(os.path.sep, '/'))
    #self.redis.hset(warc_key, self.INDEX_FILE_KEY, full_url)
    self.set_prop(self.INDEX_FILE_KEY, full_url)

    return cdxj_filename, full_filename
def load_index(self, params):
    """Produce a one-entry CDX iterator describing a live-web capture.

    Fuzzy lookups raise NotFoundException since live resources have no
    index to fuzzy-match.  When a mime filter is requested and no content
    type is known, a best-effort HEAD request fills in status and mime.
    """
    # no fuzzy match for live resources
    if params.get('is_fuzzy'):
        raise NotFoundException(params['url'] + '*')

    entry = CDXObject()
    entry['urlkey'] = params.get('key').decode('utf-8')
    entry['timestamp'] = timestamp_now()
    entry['url'] = params['url']
    entry['load_url'] = self.get_load_url(params)
    entry['is_live'] = 'true'

    mime = params.get('content_type', '')

    if params.get('filter') and not mime:
        try:
            head_res = self.sesh.head(entry['load_url'])
            if head_res.status_code != 405:
                entry['status'] = str(head_res.status_code)

            ctype = head_res.headers.get('Content-Type')
            if ctype:
                mime = ctype.split(';')[0]
        except Exception:
            # best-effort probe: failures must not block live loading
            pass

    entry['mime'] = mime

    return iter([entry])
def test_live(self):
    """LiveIndexSource should yield one now-timestamped live entry."""
    live_url = 'http://example.com/'
    source = LiveIndexSource()

    res, errs = self.query_single_source(source, dict(url=live_url))

    want = 'com,example)/ {0} {1}'.format(timestamp_now(), live_url)
    assert key_ts_res(res, 'load_url') == want
    assert errs == {}
def proxy_route_request(self, url, environ):
    """Prefix *url* with the proxy prefix, recording the client's default
    timestamp (from Redis, falling back to now) into the WSGI environ.

    The Redis lookup is best-effort: on failure the traceback is printed
    and routing proceeds with the plain prefix.
    """
    try:
        redis_key = 'up:' + environ['REMOTE_ADDR']
        ts = self.redis.hget(redis_key, 'timestamp') or timestamp_now()
        environ['pywb_redis_key'] = redis_key
        environ['pywb_proxy_default_timestamp'] = ts
    except Exception:
        traceback.print_exc()

    return self.proxy_prefix + url
def load_index(self, params):
    """Create a single CDX entry that routes the request through the
    configured upstream proxy URL, then delegate loading to _do_load."""
    closest = params.get('closest')

    cdx = CDXObject()
    cdx['urlkey'] = params.get('key').decode('utf-8')
    cdx['timestamp'] = closest or timestamp_now()
    cdx['url'] = params['url']
    cdx['load_url'] = res_template(self.proxy_url, params)
    cdx['memento_url'] = cdx['load_url']

    return self._do_load(cdx, params)
def get_top_frame(self, wb_url, wb_prefix, host_prefix,
                  env, frame_mod, replay_mod,
                  coll='', extra_params=None):
    """Render the top-level frame insert for a replayed page.

    :param rewrite.wburl.WbUrl wb_url: WbUrl this template is rendered for
    :param str wb_prefix: URL prefix pywb serves content under
        (e.g. http://localhost:8080/live/)
    :param str host_prefix: host URL prefix pywb runs on
        (e.g. http://localhost:8080)
    :param dict env: WSGI environment for this request
    :param str frame_mod: modifier used for framing (e.g. if_)
    :param str replay_mod: modifier used in the replayed page URL (e.g. mp_)
    :param str coll: collection name being rendered
    :param dict extra_params: extra values passed to the Jinja template
    :return: the frame insert string
    :rtype: str
    """
    timestamp = wb_url.timestamp or timestamp_now()

    params = dict(host_prefix=host_prefix,
                  wb_prefix=wb_prefix,
                  wb_url=wb_url,
                  coll=coll,
                  options=dict(frame_mod=frame_mod,
                               replay_mod=replay_mod),
                  embed_url=wb_url.to_str(mod=replay_mod),
                  is_proxy='wsgiprox.proxy_host' in env,
                  timestamp=timestamp,
                  url=wb_url.get_url())

    if extra_params:
        params.update(extra_params)

    if self.banner_view:
        params['banner_html'] = self.banner_view.render_to_string(
            env, **params)

    return self.render_to_string(env, **params)
def get_top_frame(self, wb_url, wb_prefix, host_prefix,
                  env, frame_mod, replay_mod,
                  coll='', extra_params=None):
    """Render the top-level frame insert for a replayed page.

    :param rewrite.wburl.WbUrl wb_url: WbUrl this template is rendered for
    :param str wb_prefix: URL prefix pywb serves content under
        (e.g. http://localhost:8080/live/)
    :param str host_prefix: host URL prefix pywb runs on
        (e.g. http://localhost:8080)
    :param dict env: WSGI environment for this request
    :param str frame_mod: modifier used for framing (e.g. if_)
    :param str replay_mod: modifier used in the replayed page URL (e.g. mp_)
    :param str coll: collection name being rendered
    :param dict extra_params: extra values passed to the Jinja template
    :return: the frame insert string
    :rtype: str
    """
    ts = wb_url.timestamp if wb_url.timestamp else timestamp_now()

    render_params = {
        'host_prefix': host_prefix,
        'wb_prefix': wb_prefix,
        'wb_url': wb_url,
        'coll': coll,
        'options': {'frame_mod': frame_mod, 'replay_mod': replay_mod},
        'embed_url': wb_url.to_str(mod=replay_mod),
        'is_proxy': 'wsgiprox.proxy_host' in env,
        'timestamp': ts,
        'url': wb_url.get_url(),
    }

    if extra_params:
        render_params.update(extra_params)

    if self.banner_view:
        banner_html = self.banner_view.render_to_string(env, **render_params)
        render_params['banner_html'] = banner_html

    return self.render_to_string(env, **render_params)
def __call__(self, params):
    """Run a CDX query described by *params*.

    Resolves the special ``closest='now'`` value to the current timestamp
    and turns a requested content type into a mime filter before loading
    and post-processing the index.

    :returns: (processed cdx iterator, dict of per-source errors)
    """
    if params.get('closest') == 'now':
        params['closest'] = timestamp_now()

    mime = params.get('content_type')
    if mime:
        params['filter'] = '=mime:' + mime

    query = CDXQuery(params)

    raw_iter, errs = self.load_index(query.params)
    return process_cdx(raw_iter, query), dict(errs)
def get_top_frame(self, wb_url, wb_prefix, host_prefix,
                  env, frame_mod, replay_mod,
                  coll='', extra_params=None):
    """Render the top-level frame insert for a replayed page.

    Uses the WbUrl's own timestamp when present, otherwise the current
    time; appends banner HTML when a banner view is configured.

    :return: the frame insert string
    :rtype: str
    """
    frame_ts = wb_url.timestamp or timestamp_now()

    params = dict(host_prefix=host_prefix,
                  wb_prefix=wb_prefix,
                  wb_url=wb_url,
                  coll=coll,
                  options=dict(frame_mod=frame_mod,
                               replay_mod=replay_mod),
                  embed_url=wb_url.to_str(mod=replay_mod),
                  is_proxy='wsgiprox.proxy_host' in env,
                  timestamp=frame_ts,
                  url=wb_url.get_url())

    if extra_params:
        params.update(extra_params)

    if self.banner_view:
        params['banner_html'] = self.banner_view.render_to_string(
            env, **params)

    return self.render_to_string(env, **params)
def write_dns(self, dns, ttl, url):
    """Write the DNS lookup result for *url*'s host as a WARC ``resource``
    record, linked to the warcinfo record.

    The record is written even when *dns* is empty.

    :param dns: iterable of resolver reply dicts with a 'host' entry
    :param ttl: time-to-live for the replies (coerced to int)
    :param url: parsed URL object; only ``url.hostname`` is used
    """
    # TODO: we filter the addresses early, should we warc the unfiltered dns repsonse?
    # the response object doesn't contain the query type 'A' or 'AAAA'
    # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
    kind = 'A'  # fixme IPV6
    ttl = int(ttl)
    host = url.hostname
    if self.writer is None:
        self.open()

    payload = timestamp_now() + '\r\n'

    for r in dns:
        try:
            payload += '\t'.join(
                (host + '.', str(ttl), 'IN', kind, r['host'])) + '\r\n'
        except Exception as e:
            # Fix: the original passed bare positional args with no %s
            # placeholders, which raises a logging formatting error
            # instead of logging the reply.
            LOGGER.info(
                'problem converting dns reply for warcing: host=%s reply=%r (%s)',
                host, r, e)

    payload = payload.encode('utf-8')

    warc_headers_dict = OrderedDict()
    warc_headers_dict['WARC-Warcinfo-ID'] = self.warcinfo_id

    record = self.writer.create_warc_record(
        'dns:' + host, 'resource',
        warc_content_type='text/dns',
        payload=BytesIO(payload),
        length=len(payload),
        warc_headers_dict=warc_headers_dict)
    self.writer.write_record(record)
    LOGGER.debug('wrote warc dns response record%s for host %s',
                 p(self.prefix), host)
    stats.stats_sum('warc dns' + p(self.prefix), 1)
def _get_pagedata(self, user, coll, rec, pagedata):
    """Resolve a timestamp for *pagedata* and serialize it for storage.

    Timestamp resolution order: explicit 'timestamp', legacy 'ts',
    lookup by URL, and finally the current time.

    :returns: (redis key, hash field key, JSON-serialized pagedata)
    """
    key = self.page_key.format(user=user, coll=coll, rec=rec)

    url = pagedata['url']

    # lazy or-chain: later lookups only run when earlier ones are empty
    ts = (pagedata.get('timestamp') or
          pagedata.get('ts') or
          self._get_url_ts(user, coll, rec, url) or
          timestamp_now())

    pagedata['timestamp'] = ts
    hkey = pagedata['url'] + ' ' + pagedata['timestamp']

    return key, hkey, json.dumps(pagedata)
def process_cdxj_key(self, cdxj_key):
    """Commit the index and WARC files of a closed recording to storage.

    Skips recordings that are still open.  Publishes a ``close_rec``
    event, writes the CDXJ index file, commits it and every WARC, and
    deletes the Redis index key (and empty user dir) once everything is
    committed.

    :param str cdxj_key: Redis key of the recording's cdxj sorted set
    """
    base_key = cdxj_key.rsplit(':cdxj', 1)[0]

    if self.redis.exists(base_key + ':open'):
        return

    _, user, coll, rec = base_key.split(':', 3)
    user_dir = os.path.join(self.record_root_dir, user)

    warc_key = base_key + ':warc'
    warcs = self.redis.hgetall(warc_key)

    info_key = base_key + ':info'
    self.redis.publish('close_rec', info_key)

    try:
        timestamp = sec_to_timestamp(
            int(self.redis.hget(info_key, 'updated_at')))
    except Exception:
        # Fix: was a bare ``except:`` which also swallowed
        # KeyboardInterrupt/SystemExit.  'updated_at' may be missing
        # (None) or non-numeric; fall back to the current time.
        timestamp = timestamp_now()

    cdxj_filename = self.write_cdxj(warc_key, user_dir, cdxj_key,
                                    timestamp)

    all_done = self.commit_file(user, coll, rec, user_dir,
                                cdxj_filename, 'indexes', warc_key,
                                cdxj_filename, self.info_index_key)

    # iterate items directly instead of keys + per-key lookup
    for warc_filename, value in warcs.items():
        done = self.commit_file(user, coll, rec, user_dir,
                                warc_filename, 'warcs', warc_key, value)
        all_done = all_done and done

    if all_done:
        print('Deleting Redis Key: ' + cdxj_key)
        self.redis.delete(cdxj_key)
        self.remove_if_empty(user_dir)
def write_cdxj(self, user, cdxj_key):
    """Write CDX index lines to file.

    Reuses an already-recorded index file when one exists; otherwise
    writes a new uniquely-named file and records its local-store URL.

    :param RedisUniqueComponent user: user
    :param str cdxj_key: CDX index file Redis key

    :returns: CDX file filename and path
    :rtype: str and str
    """
    #full_filename = self.redis.hget(warc_key, self.INDEX_FILE_KEY)
    existing = self.get_prop(self.INDEX_FILE_KEY)
    if existing:
        return os.path.basename(strip_prefix(existing)), existing

    target_dir = user.get_user_temp_warc_path()
    os.makedirs(target_dir, exist_ok=True)

    rand_part = base64.b32encode(os.urandom(5)).decode('utf-8')
    cdxj_filename = self.INDEX_NAME_TEMPL.format(timestamp=timestamp_now(),
                                                 random=rand_part)
    full_filename = os.path.join(target_dir, cdxj_filename)

    lines = self.redis.zrange(cdxj_key, 0, -1)
    with open(full_filename, 'wt') as out:
        out.writelines(line + '\n' for line in lines)
        out.flush()

    full_url = add_local_store_prefix(
        full_filename.replace(os.path.sep, '/'))
    #self.redis.hset(warc_key, self.INDEX_FILE_KEY, full_url)
    self.set_prop(self.INDEX_FILE_KEY, full_url)

    return cdxj_filename, full_filename
def get_transclusion_metadata(self, tc, url, orig_mime=None):
    """Build metadata describing a transclusion of *url* inside a containing page.

    :param dict tc: transclusion info; reads 'url', and optionally
        'timestamp', 'selector', 'metadata_file'
    :param str url: URL of the transcluded resource
    :param orig_mime: mime type of the original resource, if known
    :returns: (containing page url, containing page timestamp, metadata dict)
    """
    contain_url = tc['url']
    contain_ts = tc.get('timestamp') or timestamp_now()
    # timestamp may arrive as a non-string; normalize to str
    contain_ts = str(contain_ts)
    selector = tc.get('selector')

    if tc.get('metadata_file'):
        # NOTE(review): this branch reads the file but never assigns
        # all_metadata, so the return below would raise NameError when a
        # metadata_file is supplied -- confirm whether the file contents
        # were meant to be parsed into all_metadata.
        with open(tc.get('metadata_file'), 'rt') as fh:
            metadata = fh.read()
    else:
        all_metadata = {}
        all_metadata['webpage_url'] = contain_url
        all_metadata['webpage_timestamp'] = contain_ts
        if selector:
            all_metadata['selector'] = selector

        formats = []

        # include any converted renditions of the resource
        if self.conversion_serializer:
            for file_info, _, metadata in self.conversion_serializer.find_conversions(url):
                metadata['url'] = file_info.url
                metadata['original_url'] = url
                formats.append(metadata)

        # always include the original resource as a format entry
        orig_format = {
            'url': url,
            'ext': url.rsplit('.')[-1],
            'original': True,
        }

        if orig_mime:
            orig_format['mime'] = orig_mime

        formats.append(orig_format)

        all_metadata['formats'] = formats

    return contain_url, contain_ts, all_metadata
def write_snapshot(self, user, collection, url, title, html_text,
                   referrer, user_agent, browser=None):
    """Record a static HTML snapshot of *url* into the shared
    'Static Snapshots' recording of *collection*.

    PUTs the rendered HTML upstream as a WARC record; when a title is
    available, also registers a page entry tagged 'snapshot'.

    :param user: owner of the collection (``user.name`` is used)
    :param collection: collection object to record into
    :param str url: URL the snapshot was taken of
    :param str title: page title; when empty no page entry is created
    :param str html_text: serialized HTML of the page
    :param str referrer: referrer URL, stored as WARC-Referer
    :param str user_agent: user agent string, stored as WARC-User-Agent
    :param browser: optional browser id recorded in the page data
    :returns: dict with 'snapshot' (page data or '') or 'error_message'
    """
    snap_title = 'Static Snapshots'

    snap_rec_name = self.sanitize_title(snap_title)

    # reuse the shared snapshots recording, creating it on first use
    recording = collection.get_recording_by_name(snap_rec_name)
    if not recording:
        recording = collection.create_recording(snap_rec_name,
                                                title=snap_rec_name)

    kwargs = dict(user=user.name,
                  coll=collection.my_id,
                  rec=quote(snap_rec_name, safe='/*'),
                  type='put_record')

    params = {'url': url}

    upstream_url = self.content_app.get_upstream_url('', kwargs, params)

    # WARC record metadata is carried via request headers
    headers = {'Content-Type': 'text/html; charset=utf-8',
               'WARC-User-Agent': user_agent,
               'WARC-Referer': referrer,
              }

    r = requests.put(upstream_url,
                     data=BytesIO(html_text.encode('utf-8')),
                     headers=headers,
                    )

    try:
        res = r.json()
        if res['success'] != 'true':
            print(res)
            return {'error_message': 'Snapshot Failed'}

        warc_date = res.get('WARC-Date')

    except Exception as e:
        print(e)
        return {'error_message': 'Snapshot Failed'}

    if not title:
        # record was written but no page entry is created without a title
        return {'snapshot': ''}

    # prefer the actual WARC record date; fall back to the current time
    if warc_date:
        timestamp = iso_date_to_timestamp(warc_date)
    else:
        timestamp = timestamp_now()

    page_data = {'url': url,
                 'title': title,
                 'timestamp': timestamp,
                 'tags': ['snapshot'],
                }

    if browser:
        page_data['browser'] = browser

    res = recording.add_page(page_data)

    return {'snapshot': page_data}
def handle_download(self, user, coll_name, recs):
    """Stream a WARC download of collection *coll_name*.

    Emits a collection-level warcinfo record, then for each selected
    recording a recording-level warcinfo followed by its WARC files.
    Sets Content-Type/Disposition headers; computes Content-Length
    up-front unless chunked transfer encoding is configured.

    :param user: username, re-resolved to a user object
    :param str coll_name: collection name
    :param str recs: '*' for all recordings, or comma-separated names
    :returns: generator yielding the response body
    """
    user, collection = self.user_manager.get_user_coll(user, coll_name)

    if not collection:
        self._raise_error(404, 'no_such_collection')

    # superusers may download any collection; others need write access
    if not self.access.is_superuser():
        self.access.assert_can_write_coll(collection)

    # collection['uid'] = coll

    collection.load()

    Stats(self.redis).incr_download(collection)

    now = timestamp_now()

    # download name: collection name, single recording name, or combined
    name = coll_name

    if recs != '*':
        rec_list = recs.split(',')

        if len(rec_list) == 1:
            name = recs
        else:
            name += '-' + recs
    else:
        rec_list = None

    filename = self.download_filename.format(title=quote(name),
                                             timestamp=now)

    loader = BlockLoader()

    coll_info = self.create_coll_warcinfo(user, collection, filename)

    def iter_infos():
        # yields (recording, warcinfo record, total size) per recording
        for recording in collection.get_recordings(load=True):
            if rec_list and recording.name not in rec_list:
                continue

            warcinfo = self.create_rec_warcinfo(user,
                                                collection,
                                                recording,
                                                filename)

            size = len(warcinfo)
            size += recording.size

            yield recording, warcinfo, size

    def read_all(infos):
        # streams coll warcinfo, then each rec warcinfo + its WARC files
        yield coll_info

        for recording, warcinfo, _ in infos:
            yield warcinfo

            for n, warc_path in recording.iter_all_files():
                try:
                    fh = loader.load(warc_path)
                except Exception:
                    print('Skipping invalid ' + warc_path)
                    continue

                for chunk in StreamIter(fh):
                    yield chunk

    response.headers['Content-Type'] = 'application/octet-stream'
    response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + filename

    # if not transfer-encoding, store infos and calculate total size
    if not self.download_chunk_encoded:
        size = len(coll_info)
        infos = list(iter_infos())
        size += sum(size for r, i, size in infos)
        response.headers['Content-Length'] = size
        return read_all(infos)

    else:
        # stream everything
        response.headers['Transfer-Encoding'] = 'chunked'
        return read_all(iter_infos())
def write_snapshot(self, user, coll, url, title, html_text,
                   referrer, user_agent, browser=None):
    """Record a static HTML snapshot of *url* into the shared
    'Static Snapshots' recording of collection *coll* (manager-based API).

    PUTs the rendered HTML upstream as a snapshot record; when a title is
    available, also registers a page entry tagged 'snapshot'.

    :param user: username owning the collection
    :param coll: collection id
    :param str url: URL the snapshot was taken of
    :param str title: page title; when empty no page entry is created
    :param str html_text: serialized HTML of the page
    :param str referrer: referrer URL, stored as WARC-Referer
    :param str user_agent: user agent string, stored as WARC-User-Agent
    :param browser: optional browser id recorded in the page data
    :returns: dict with 'snapshot' (page data or '') or 'error_message'
    """
    snap_title = 'Static Snapshots'

    snap_rec = self.sanitize_title(snap_title)

    # create the shared snapshots recording on first use
    # (note: 'recording' is not used afterwards; add_page goes via manager)
    if not self.manager.has_recording(user, coll, snap_rec):
        recording = self.manager.create_recording(user, coll, snap_rec,
                                                  snap_title)

    kwargs = dict(user=user,
                  coll=quote(coll),
                  rec=quote(snap_rec, safe='/*'),
                  type='snapshot')

    params = {'url': url}

    upstream_url = self.manager.content_app.get_upstream_url(
        '', kwargs, params)

    # WARC record metadata is carried via request headers
    headers = {
        'Content-Type': 'text/html; charset=utf-8',
        'WARC-User-Agent': user_agent,
        'WARC-Referer': referrer,
    }

    r = requests.put(
        upstream_url,
        data=BytesIO(html_text.encode('utf-8')),
        headers=headers,
    )

    try:
        res = r.json()
        if res['success'] != 'true':
            print(res)
            return {'error_message': 'Snapshot Failed'}

        warc_date = res.get('WARC-Date')

    except Exception as e:
        print(e)
        return {'error_message': 'Snapshot Failed'}

    if not title:
        # record was written but no page entry is created without a title
        return {'snapshot': ''}

    # prefer the actual WARC record date; fall back to the current time
    if warc_date:
        timestamp = iso_date_to_timestamp(warc_date)
    else:
        timestamp = timestamp_now()

    page_data = {
        'url': url,
        'title': title,
        'timestamp': timestamp,
        'tags': ['snapshot'],
    }

    if browser:
        page_data['browser'] = browser

    res = self.manager.add_page(user, coll, snap_rec, page_data)

    return {'snapshot': page_data}
def handle_download(self, user, coll_name, recs):
    """Stream a WARC download of collection *coll_name*.

    Emits a collection-level warcinfo record, then for each selected
    recording a recording-level warcinfo followed by its WARC files.
    Sets Content-Type/Disposition headers; computes Content-Length
    up-front unless chunked transfer encoding is configured.

    :param user: username, re-resolved to a user object
    :param str coll_name: collection name
    :param str recs: '*' for all recordings, or comma-separated names
    :returns: generator yielding the response body
    """
    user, collection = self.user_manager.get_user_coll(user, coll_name)

    if not collection:
        self._raise_error(404, 'no_such_collection')

    self.access.assert_can_write_coll(collection)

    #collection['uid'] = coll

    collection.load()

    Stats(self.redis).incr_download(collection)

    now = timestamp_now()

    # download name: collection name, single recording name, or combined
    name = coll_name

    if recs != '*':
        rec_list = recs.split(',')

        if len(rec_list) == 1:
            name = recs
        else:
            name += '-' + recs
    else:
        rec_list = None

    filename = self.download_filename.format(title=quote(name),
                                             timestamp=now)

    loader = BlockLoader()

    coll_info = self.create_coll_warcinfo(user, collection, filename)

    def iter_infos():
        # yields (recording, warcinfo record, total size) per recording
        for recording in collection.get_recordings(load=True):
            if rec_list and recording.name not in rec_list:
                continue

            warcinfo = self.create_rec_warcinfo(user,
                                                collection,
                                                recording,
                                                filename)

            size = len(warcinfo)
            size += recording.size

            yield recording, warcinfo, size

    def read_all(infos):
        # streams coll warcinfo, then each rec warcinfo + its WARC files
        yield coll_info

        for recording, warcinfo, _ in infos:
            yield warcinfo

            for n, warc_path in recording.iter_all_files():
                try:
                    fh = loader.load(warc_path)
                # NOTE(review): bare except also catches KeyboardInterrupt/
                # SystemExit; consider narrowing to Exception
                except:
                    print('Skipping invalid ' + warc_path)
                    continue

                for chunk in StreamIter(fh):
                    yield chunk

    response.headers['Content-Type'] = 'application/octet-stream'
    response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + filename

    # if not transfer-encoding, store infos and calculate total size
    if not self.download_chunk_encoded:
        size = len(coll_info)
        infos = list(iter_infos())
        size += sum(size for r, i, size in infos)
        response.headers['Content-Length'] = size
        return read_all(infos)

    else:
        # stream everything
        response.headers['Transfer-Encoding'] = 'chunked'
        return read_all(iter_infos())
def handle_download(self, user, coll, rec):
    """Stream a WARC download of collection *coll* (dict-based manager API).

    Emits a collection-level warcinfo record, then for each selected
    recording a recording-level warcinfo followed by its WARC files.
    Sets Content-Type/Disposition headers; computes Content-Length
    up-front unless chunked transfer encoding is configured.

    :param user: username owning the collection
    :param coll: collection id
    :param str rec: '*' for all recordings, or comma-separated rec ids
    :returns: generator yielding the response body
    """
    collection = self.manager.get_collection(user, coll, rec)
    if not collection:
        self._raise_error(404, 'Collection not found', id=coll)

    now = timestamp_now()

    # download name: collection id, single rec id, or combined
    name = collection['id']

    if rec != '*':
        rec_list = rec.split(',')

        if len(rec_list) == 1:
            name = rec
        else:
            name += '-' + rec
    else:
        rec_list = None

    filename = self.download_filename.format(title=quote(name),
                                             timestamp=now)

    loader = BlockLoader()

    coll_info = self.create_coll_warcinfo(user, collection, filename)

    def iter_infos():
        # yields (recording dict, warcinfo record, total size) per recording
        for recording in collection['recordings']:
            if rec_list and recording['id'] not in rec_list:
                continue

            warcinfo = self.create_rec_warcinfo(user,
                                                collection,
                                                recording,
                                                filename)

            size = len(warcinfo)
            size += recording['size']

            yield recording, warcinfo, size

    def read_all(infos):
        # streams coll warcinfo, then each rec warcinfo + its WARC files
        yield coll_info

        for recording, warcinfo, _ in infos:
            yield warcinfo

            for warc_path in self._iter_all_warcs(user, coll,
                                                  recording['id']):
                try:
                    fh = loader.load(warc_path)
                # NOTE(review): bare except also catches KeyboardInterrupt/
                # SystemExit; consider narrowing to Exception
                except:
                    print('Skipping invalid ' + warc_path)
                    continue

                for chunk in StreamIter(fh):
                    yield chunk

    response.headers['Content-Type'] = 'application/octet-stream'
    response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + filename

    # if not transfer-encoding, store infos and calculate total size
    if not self.download_chunk_encoded:
        size = len(coll_info)
        infos = list(iter_infos())
        size += sum(size for r, i, size in infos)
        response.headers['Content-Length'] = size
        return read_all(infos)

    else:
        # stream everything
        response.headers['Transfer-Encoding'] = 'chunked'
        return read_all(iter_infos())