def merge_request_data(self, other, options):
    surt_ordered = options.get('surt_ordered', True)

    if other.record.rec_type != 'request':
        return False

    # two requests, not correct
    if self.record.rec_type == 'request':
        return False

    # merge POST/PUT body query (guard against a missing post query
    # before calling append_query)
    post_query = other.get('_post_query')
    if post_query:
        url = self['url']
        new_url = post_query.append_query(url)
        new_url = new_url.replace('WB_wombat_', '')
        if new_url != url:
            self['urlkey'] = canonicalize(new_url, surt_ordered)
            other['urlkey'] = self['urlkey']
            self['method'] = post_query.method
            self['requestBody'] = post_query.query

    referer = other.record.http_headers.get_header('referer')
    if referer:
        self['_referer'] = referer

    return True
def rewrite_record(self, headers, content, ts,
                   url='http://example.com/',
                   prefix='http://localhost:8080/prefix/',
                   warc_headers=None,
                   request_url=None,
                   is_live=None):
    record = self._create_response_record(url, headers, content, warc_headers)

    wburl = WbUrl(ts + '/' + (request_url or url))
    url_rewriter = UrlRewriter(wburl, prefix)

    cdx = CDXObject()
    cdx['url'] = url
    cdx['timestamp'] = ts
    cdx['urlkey'] = canonicalize(url)
    if request_url != url:
        cdx['is_fuzzy'] = '1'
    cdx['is_live'] = is_live

    return self.content_rewriter(record, url_rewriter, None, cdx=cdx)
def rewrite_record(self, headers, content, ts,
                   url='http://example.com/',
                   prefix='http://localhost:8080/prefix/',
                   warc_headers=None,
                   request_url=None,
                   is_live=None,
                   use_js_proxy=True,
                   environ=None):
    record = self._create_response_record(url, headers, content, warc_headers)

    wburl = WbUrl(ts + '/' + (request_url or url))
    url_rewriter = UrlRewriter(wburl, prefix)

    cdx = CDXObject()
    cdx['url'] = url
    cdx['timestamp'] = ts
    cdx['urlkey'] = canonicalize(url)
    if request_url != url:
        cdx['is_fuzzy'] = '1'
    cdx['is_live'] = is_live

    def insert_func(rule, cdx):
        return ''

    if use_js_proxy:
        rewriter = self.js_proxy_content_rewriter
    else:
        rewriter = self.content_rewriter

    return rewriter(record, url_rewriter,
                    cookie_rewriter=None,
                    head_insert_func=insert_func,
                    cdx=cdx,
                    environ=environ)
def handle_timegate(self, params, timestamp):
    url = params['url']
    load_url = self.timegate_url.format(url=url, timestamp=timestamp)

    res = None
    try:
        headers = self._get_headers(params)
        res = self.sesh.head(load_url, headers=headers)
    except Exception as e:
        no_except_close(res)
        raise NotFoundException(url)

    if res and res.headers.get('Memento-Datetime'):
        if res.status_code >= 400:
            no_except_close(res)
            raise NotFoundException(url)

        if res.status_code >= 300:
            info = self._extract_location(url, res.headers.get('Location'))
        else:
            info = self._extract_location(url,
                                          res.headers.get('Content-Location'))

        url, timestamp, load_url = info

    cdx = CDXObject()
    cdx['urlkey'] = canonicalize(url)
    cdx['timestamp'] = timestamp
    cdx['url'] = url
    cdx['load_url'] = load_url

    if 'Referer' in headers:
        cdx['set_referrer'] = headers['Referer']

    return iter([cdx])
def create_record_iter(self, arcv_iter):
    append_post = self.options.get('append_post')
    include_all = self.options.get('include_all')
    block_size = self.options.get('block_size', 16384)
    surt_ordered = self.options.get('surt_ordered', True)
    minimal = self.options.get('minimal')

    if append_post and minimal:
        raise Exception('Sorry, minimal index option and ' +
                        'append POST options can not be used together')

    for record in arcv_iter.iter_records(block_size):
        entry = None

        if not include_all and not minimal and (
                record.status_headers.get_statuscode() == '-'):
            continue

        if record.format == 'warc':
            if (record.rec_type in ('request', 'warcinfo') and
                    not include_all and
                    not append_post):
                continue

            elif (not include_all and
                  record.content_type == 'application/warc-fields'):
                continue

            entry = self.parse_warc_record(record)
        elif record.format == 'arc':
            entry = self.parse_arc_record(record)

        if not entry:
            continue

        if entry.get('url') and not entry.get('urlkey'):
            entry['urlkey'] = canonicalize(entry['url'], surt_ordered)

        compute_digest = False

        if (entry.get('digest', '-') == '-' and
                record.rec_type not in ('revisit', 'request', 'warcinfo')):
            compute_digest = True

        elif not minimal and record.rec_type == 'request' and append_post:
            method = record.status_headers.protocol
            len_ = record.status_headers.get_header('Content-Length')

            post_query = extract_post_query(method,
                                            entry.get('mime'),
                                            len_,
                                            record.stream)
            entry['_post_query'] = post_query

        arcv_iter.read_to_end(record, compute_digest)
        entry.set_rec_info(*arcv_iter.member_info)
        entry.record = record

        yield entry
def get_params(self, url, actual_url, mime='text/html'):
    params = {'url': url,
              'cdx_url': actual_url,
              'key': canonicalize(url),
              'mime': mime}
    return params
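# Hypothetical call of get_params above (assumes pywb-style SURT
# canonicalization): the fuzzy-match key comes from the "clean" url,
# while cdx_url preserves the actual capture url with its cache-buster.
#
#   get_params('http://example.com/page',
#              'http://example.com/page?_=1234567890')
#   -> {'url': 'http://example.com/page',
#       'cdx_url': 'http://example.com/page?_=1234567890',
#       'key': 'com,example)/page',
#       'mime': 'text/html'}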
def create_record_iter(arcv_iter, options):
    append_post = options.get('append_post')
    include_all = options.get('include_all')
    block_size = options.get('block_size', 16384)

    for record in arcv_iter.iter_records(block_size):
        entry = None

        if not include_all and (record.status_headers.get_statuscode() == '-'):
            continue

        if record.format == 'warc':
            if (record.rec_type in ('request', 'warcinfo') and
                    not include_all and
                    not append_post):
                continue

            elif (not include_all and
                  record.content_type == 'application/warc-fields'):
                continue

            entry = parse_warc_record(record)
        elif record.format == 'arc':
            entry = parse_arc_record(record)

        if not entry:
            continue

        if entry.url and not entry.key:
            entry.key = canonicalize(entry.url,
                                     options.get('surt_ordered', True))

        compute_digest = False

        if (entry.digest == '-' and
                record.rec_type not in ('revisit', 'request', 'warcinfo')):
            compute_digest = True

        elif record.rec_type == 'request' and options.get('append_post'):
            method = record.status_headers.protocol
            len_ = record.status_headers.get_header('Content-Length')

            post_query = extract_post_query(method,
                                            entry.mime,
                                            len_,
                                            record.stream)
            entry.post_query = post_query

        #entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
        arcv_iter.read_to_end(record, compute_digest)
        entry.set_rec_info(*arcv_iter.member_info)
        entry.record = record

        yield entry
def fetch_request(self, url, urlrewriter,
                  head_insert_func=None,
                  urlkey=None,
                  env=None,
                  req_headers={},
                  timestamp=None,
                  follow_redirects=False,
                  proxies=None):
    ts_err = url.split('///')

    # fixup for accidental erroneous rewrite which has ///
    # (unless file:///)
    if len(ts_err) > 1 and ts_err[0] != 'file:':
        url = 'http://' + ts_err[1]

    if url.startswith('//'):
        url = 'http:' + url

    if is_http(url):
        (status_headers, stream) = self.fetch_http(url, env, req_headers,
                                                   follow_redirects,
                                                   proxies)
    else:
        (status_headers, stream) = self.fetch_local_file(url)

    # explicit urlkey may be passed in (say for testing)
    if not urlkey:
        urlkey = canonicalize(url)

    if timestamp is None:
        timestamp = datetime_to_timestamp(datetime.datetime.utcnow())

    cdx = {'urlkey': urlkey,
           'timestamp': timestamp,
           'original': url,
           'statuscode': status_headers.get_statuscode(),
           'mimetype': status_headers.get_header('Content-Type'),
           'is_live': True,
           }

    result = (self.rewriter.
              rewrite_content(urlrewriter,
                              status_headers,
                              stream,
                              head_insert_func=head_insert_func,
                              urlkey=urlkey,
                              cdx=cdx))

    if env:
        env['pywb.cdx'] = cdx

    return result
def __init__(self, url):
    self.url = url
    split = canonicalize(url).split(')/')
    dd = split[0].split(',')
    if len(dd) > 2:
        self.domain = ','.join(dd[:2])
    else:
        self.domain = split[0]
    self.path = split[1]
    self.pathLen = len(self.path.split('/'))
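# Illustrative sketch (not part of the source): what the SURT split in
# __init__ above operates on, assuming pywb's canonicalize, which emits
# SURT-ordered keys of the form 'tld,domain,sub)/path'.
from pywb.utils.canonicalize import canonicalize

key = canonicalize('http://sub.example.com/a/b')  # -> 'com,example,sub)/a/b'
host, path = key.split(')/')                      # 'com,example,sub', 'a/b'
labels = host.split(',')                          # ['com', 'example', 'sub']
domain = ','.join(labels[:2])                     # -> 'com,example'
path_len = len(path.split('/'))                   # -> 2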
def get_expected(self, url, mime='text/html', filters=None):
    filters = filters or {'urlkey:'}
    exp = [{'filter': filters,
            'is_fuzzy': '1',
            'urlkey': canonicalize(url),
            'source': 'source',
            'source-coll': 'source',
            'url': url,
            'mime': mime}]

    return exp
def render_content(self, wbrequest):
    if wbrequest.wb_url.mod == 'vi_':
        return self._get_video_info(wbrequest)

    ref_wburl_str = wbrequest.extract_referrer_wburl_str()
    if ref_wburl_str:
        wbrequest.env['HTTP_REFERER'] = WbUrl(ref_wburl_str).url

    urlkey = canonicalize(wbrequest.wb_url.url)
    url = wbrequest.wb_url.url

    inputreq = RewriteInputRequest(wbrequest.env, urlkey, url,
                                   self.content_rewriter)

    req_data = inputreq.reconstruct_request(url)

    # header values must be strings
    headers = {'Content-Length': str(len(req_data)),
               'Content-Type': 'application/request'}

    if wbrequest.wb_url.is_latest_replay():
        closest = 'now'
    else:
        closest = wbrequest.wb_url.timestamp

    upstream_url = self.upstream_url.format(url=quote(url),
                                            closest=closest,
                                            #coll=wbrequest.coll,
                                            **wbrequest.matchdict)

    r = requests.post(upstream_url,
                      data=BytesIO(req_data),
                      headers=headers,
                      stream=True,
                      allow_redirects=False)

    r.raise_for_status()

    record = self.loader.parse_record_stream(r.raw)

    cdx = CDXObject()
    cdx['urlkey'] = urlkey
    cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
    cdx['url'] = url

    head_insert_func = self.head_insert_view.create_insert_func(wbrequest)

    result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter,
                                                   record.status_headers,
                                                   record.stream,
                                                   head_insert_func,
                                                   urlkey,
                                                   cdx)

    status_headers, gen, is_rw = result

    return self._make_response(wbrequest, *result)
def canonize_grounding(instance):
    if "grounding_urls" in instance:
        try:
            instance["grounding_urls"] = [
                url.lower() for url in instance["grounding_urls"]
            ]
            instance["grounding_canonical_urls"] = [
                canonicalize(url) for url in instance["grounding_urls"]
            ]
        except Exception as e:
            logging.error("Could not canonize grounding URLs: " + str(e))
            return
    yield instance
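# Hypothetical input/output for canonize_grounding above (field names taken
# from the function; the canonical form assumes pywb-style SURT keys):
#
#   {'grounding_urls': ['HTTP://Example.com/Page']}
#   -> {'grounding_urls': ['http://example.com/page'],
#       'grounding_canonical_urls': ['com,example)/page']}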
def convert_to_cdx(self, item, urlkey, url):
    cdx = CDXObject()
    cdx['urlkey'] = canonicalize(url)
    cdx['timestamp'] = gettext(item, 'tstamp')[:14]
    cdx['url'] = url
    cdx['mime'] = gettext(item, 'primaryType') + '/' + gettext(item, 'subType')
    cdx['status'] = '-'
    cdx['digest'] = gettext(item, 'digest')
    #cdx['length'] = gettext(item, 'contentLength')
    cdx['length'] = '-'
    cdx['offset'] = gettext(item, 'arcoffset')
    cdx['filename'] = gettext(item, 'arcname') + '.arc.gz'
    return cdx
def load_index(self, params):
    # return nothing for exact match to force fuzzy
    if params.get('matchType', 'exact') == 'exact':
        return iter([])

    cdx = {'urlkey': canonicalize(params.get('cdx_url')),
           'mime': params.get('mime'),
           'filter': params.get('filter'),
           'url': params.get('cdx_url'),
           }

    return iter([cdx])
def memento_to_cdx(self, url, mem_iter, limit, skip_exclude=True):
    key = canonicalize(url)
    if url.endswith('/'):
        key += '/'

    for mems, _ in itertools.izip(mem_iter, xrange(0, limit)):
        if len(mems) > 1:
            mem, next_, prev_, first_, last_ = mems
        else:
            mem = mems[0]

        excluded = False

        if isinstance(mem.url, list):
            mem_list = mem.url
            count = len(mem_list)
            if count > 1:
                mem_list = self.sort_archives(mem_list)
        else:
            mem_list = [mem.url]
            count = 1

        for mem_url in mem_list:
            mem_url = mem_url.encode('utf-8')

            # handle scheme relative urls
            if mem_url.startswith('//'):
                mem_url = 'http:' + mem_url

            if mem_url.startswith(EXCLUDE_LIST):
                if skip_exclude:
                    continue
                else:
                    excluded = True

            cdx = {}
            cdx['urlkey'] = key
            cdx['timestamp'] = mem.ts
            cdx['url'] = url
            cdx['src_url'] = mem_url
            cdx['sec'] = mem.sec
            cdx['src_host'] = urlsplit(mem_url).netloc
            cdx['excluded'] = excluded
            cdx['dupes'] = count

            if len(mems) > 1:
                cdx['first'] = first_.ts if first_ else ''
                cdx['last'] = last_.ts if last_ else ''
                cdx['next'] = next_.ts if next_ else ''
                cdx['prev'] = prev_.ts if prev_ else ''

            yield cdx
def convert_line(self, line, url):
    timestamp, mime, filename = line.split('\t')
    cdx = CDXObject()
    cdx['urlkey'] = canonicalize(url)
    cdx['timestamp'] = timestamp
    cdx['original'] = url
    cdx['mimetype'] = mime
    cdx['statuscode'] = '200'
    cdx['digest'] = '-'
    cdx['length'] = '-1'
    cdx['offset'] = '0'
    cdx['filename'] = filename
    return cdx
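# Hypothetical input line for convert_line above (tab-separated:
# timestamp, mime, filename; the filename is illustrative):
#
#   convert_line('20140101000000\ttext/html\texample-0001.warc.gz',
#                'http://example.com/')
#
# yields a CDX row keyed on canonicalize(url) (e.g. 'com,example)/')
# with statuscode '200' and placeholder digest/length/offset.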
def load_cdx(self, **params):
    closest = params.get('closest')

    self.check_url(params)

    if closest:
        query = self._get_closest_query(params)
    else:
        query = self._get_timemap_query(params)

    query = quote_plus(query) + self.CLOSEST_QUERY_FIXED

    full_url = self.opensearch_query + '?query=' + query
    print('QUERY', full_url)

    output = params.get('output', 'text')

    url = params.get('url')
    urlkey = canonicalize(url)

    try:
        response = requests.get(full_url, stream=True)
        buff = response.raw.read()
        response.raw.close()
    except Exception as e:
        import traceback
        # print_exc() takes no exception argument; it reports the current one
        traceback.print_exc()
        raise WbException(e)

    results = etree.fromstring(buff)

    items = results.find('channel').findall('item')

    cdx_list = [self.convert_to_cdx(item, urlkey, url) for item in items]

    if not cdx_list:
        raise NotFoundException('url {0} not found'.format(url))

    if closest:
        cdx_list = cdx_sort_closest(closest, cdx_list, limit=10000)
        #lets print the list and the closest for debug
    else:
        cdx_list = cdx_sort_closest(EARLIEST_DATE, cdx_list, limit=10000)

    if output == 'text':
        cdx_list = [str(cdx) + '\n' for cdx in cdx_list]
    elif output == 'json':
        fields = params.get('fl', '').split(',')
        cdx_list = [cdx.to_json(fields) for cdx in cdx_list]

    return iter(cdx_list)
def get_html(self, url, closest_datetime_str=None):
    canonical = canonicalize(url)
    metas = self.meta_index.get(canonical)
    if not metas:
        return None

    metas = [json.loads(m) for m in metas]
    metas = [m for m in metas if m['status'] == "200"]

    if len(metas) > 1 and closest_datetime_str and sort_by_closest(
            metas, parse_date(closest_datetime_str)):
        pass  # successfully sorted metas by reference time
    else:
        metas = sorted(metas, key=lambda m: m['filename'], reverse=True)

    html = get_first_or_none(
        map(CommonCrawlS3.fetch_html_from_s3_file, metas))
    return html
def to_key(self, url_or_surt, exact_match=False):
    """ If 'url_or_surt' already a SURT, use as is
        If exact match, add the exact match suffix

    :param str url_or_surt: The url or surt to be converted to an acl key
    :param bool exact_match: Should the exact match suffix be added to key
    :rtype: str
    """
    if self.SURT_RX.search(url_or_surt):
        result = url_or_surt
    else:
        result = canonicalize(url_or_surt)

    if exact_match:
        result += AccessChecker.EXACT_SUFFIX

    return result
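# Hypothetical usage of to_key above (EXACT_SUFFIX is whatever marker the
# enclosing AccessChecker defines; it forces an exact rather than prefix
# rule match):
#
#   to_key('http://example.com/secret')        -> 'com,example)/secret'
#   to_key('com,example)/secret')              -> unchanged, already a SURT
#   to_key('http://example.com/secret', True)  -> 'com,example)/secret' + EXACT_SUFFIX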
def convert_to_cdxj(self):
    cdxj_writer = CDXJ()
    for filename in self.iter_cdx_files():
        outfile = filename + 'j'

        print('Converting {0} -> {1}'.format(filename, outfile))

        with open(outfile + '.tmp', 'w+b') as out:
            with open(filename) as fh:
                for line in fh:
                    if line.startswith(' CDX'):
                        continue

                    cdx = CDXObject(line)
                    cdx[URLKEY] = canonicalize(cdx[ORIGINAL])

                    cdxj_writer.write_cdx_line(out, cdx, cdx['filename'])

        shutil.move(outfile + '.tmp', outfile)
        os.remove(filename)
def convert_to_cdx(self, item, urlkey, url):
    cdx = CDXObject()
    cdx['timestamp'] = gettext(item, 'tstamp')[:14]

    for elem in item.iter():
        if elem.tag == "source":
            url = elem.attrib['url']

    cdx['url'] = url
    cdx['urlkey'] = canonicalize(url)
    #print("URL?" + etree.tostring(item))
    cdx['mime'] = gettext(item, 'primaryType') + '/' + gettext(item, 'subType')
    cdx['status'] = '-'
    cdx['digest'] = gettext(item, 'digest')
    #cdx['length'] = gettext(item, 'contentLength')
    cdx['length'] = '-'
    cdx['offset'] = gettext(item, 'arcoffset')
    cdx['filename'] = gettext(item, 'arcname') + '.arc.gz'
    return cdx
def transform_cdx(cdx_path):
    with open(cdx_path, mode="r") as cdxfile:
        for line in cdxfile:
            record_dict = defaultdict()
            record_list = line.split(' ')

            # build dict
            record_dict['url'] = record_list[2]
            record_dict['mime'] = record_list[3]
            record_dict['status'] = record_list[4]
            record_dict['digest'] = record_list[5]
            record_dict['length'] = '0'
            record_dict['offset'] = record_list[7]
            record_dict['filename'] = record_list[8].replace('\n', '')

            try:
                print "{} {} {}".format(
                    canonicalize(record_list[0], surt_ordered=True),
                    record_list[1],
                    json.dumps(record_dict))
            except ValueError as e:
                print "Header"
def merge_request_data(self, other, options):
    surt_ordered = options.get('surt_ordered', True)

    if other.record.rec_type != 'request':
        return False

    # two requests, not correct
    if self.record.rec_type == 'request':
        return False

    # merge POST/PUT body query
    if hasattr(other, 'post_query'):
        url = append_post_query(self.url, other.post_query)
        self.key = canonicalize(url, surt_ordered)
        other.key = self.key

    referer = other.record.status_headers.get_header('referer')
    if referer:
        self.referer = referer

    return True
def links_to_cdxobject(self, link_header, def_name):
    results = MementoUtils.parse_links(link_header, def_name)

    original = results['original']['url']
    key = canonicalize(original)

    mementos = results['mementos']

    for val in mementos:
        dt = val['datetime']
        ts = http_date_to_timestamp(dt)
        cdx = CDXObject()
        cdx['urlkey'] = key
        cdx['timestamp'] = ts
        cdx['url'] = original
        cdx['mem_rel'] = val.get('rel', '')
        cdx['memento_url'] = val['url']

        load_url = self._get_replay_url(cdx['timestamp'], original)

        cdx['load_url'] = load_url
        yield cdx
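# Sketch of the expected input/output for links_to_cdxobject above
# (illustrative header; assumes a Memento-style Link header as parsed by
# MementoUtils.parse_links):
#
#   link_header = ('<http://example.com/>; rel="original", '
#                  '<http://archive.example/20140101000000/http://example.com/>; '
#                  'rel="memento"; datetime="Wed, 01 Jan 2014 00:00:00 GMT"')
#
# Each memento link would become a CDXObject with urlkey 'com,example)/',
# timestamp '20140101000000', and a load_url pointing at the replay endpoint.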
def merge_request_data(self, other, options):
    surt_ordered = options.get('surt_ordered', True)

    if other.record.rec_type != 'request':
        return False

    # two requests, not correct
    if self.record.rec_type == 'request':
        return False

    # merge POST/PUT body query
    post_query = other.get('_post_query')
    if post_query:
        url = append_post_query(self['url'], post_query)
        self['urlkey'] = canonicalize(url, surt_ordered)
        other['urlkey'] = self['urlkey']

    referer = other.record.status_headers.get_header('referer')
    if referer:
        self['_referer'] = referer

    return True
def raise_on_self_redirect(self, params, cdx, status_code, location_url):
    """ Check if response is a 3xx redirect to the same url
    If so, reject this capture to avoid causing redirect loop
    """
    if cdx.get('is_live'):
        return

    if not status_code.startswith('3') or status_code == '304':
        return

    request_url = params['url'].lower()
    if not location_url:
        return

    location_url = location_url.lower()
    if location_url.startswith('/'):
        host = urlsplit(cdx['url']).netloc
        location_url = host + location_url

    location_url = location_url.split('://', 1)[-1].rstrip('/')
    request_url = request_url.split('://', 1)[-1].rstrip('/')

    self_redir = False

    if request_url == location_url:
        self_redir = True
    elif params.get('sr-urlkey'):
        # if new location canonicalized matches old key, also self-redirect
        if canonicalize(location_url) == params.get('sr-urlkey'):
            self_redir = True

    if self_redir:
        msg = 'Self Redirect {0} -> {1}'
        msg = msg.format(request_url, location_url)
        params['sr-urlkey'] = cdx['urlkey']
        raise LiveResourceException(msg)
def raise_on_self_redirect(self, params, cdx, status_code, location_url):
    """ Check if response is a 3xx redirect to the same url
    If so, reject this capture to avoid causing redirect loop
    """
    if cdx.get('is_live'):
        return

    if not status_code.startswith('3') or status_code == '304':
        return

    request_url = params['url'].lower()
    if not location_url:
        return

    location_url = location_url.lower()
    if location_url.startswith('/'):
        host = urlsplit(cdx['url']).netloc
        location_url = host + location_url

    location_url = location_url.split('://', 1)[-1].rstrip('/')
    request_url = request_url.split('://', 1)[-1].rstrip('/')

    self_redir = False

    orig_key = params.get('sr-urlkey') or cdx['urlkey']

    if request_url == location_url:
        self_redir = True
    # if new location canonicalized matches old key, also self-redirect
    elif canonicalize(location_url) == orig_key:
        self_redir = True

    if self_redir:
        msg = 'Self Redirect {0} -> {1}'
        msg = msg.format(request_url, location_url)
        params['sr-urlkey'] = orig_key
        raise LiveResourceException(msg)
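# Worked example for the two raise_on_self_redirect variants above
# (illustrative values): a capture of 'http://example.com/dir/' whose
# Location is 'https://example.com/dir' normalizes both sides to
# 'example.com/dir' once the scheme is split off and trailing slashes are
# stripped, so request_url == location_url and the capture is rejected
# with LiveResourceException.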
def merge_request_data(self, other, options):
    surt_ordered = options.get('surt_ordered', True)

    if other.record.rec_type != 'request':
        return False

    # two requests, not correct
    if self.record.rec_type == 'request':
        return False

    # merge POST/PUT body query (guard against a missing post query
    # before calling append_query)
    post_query = other.get('_post_query')
    if post_query:
        url = self['url']
        new_url = post_query.append_query(url)
        new_url = new_url.replace('WB_wombat_', '')
        if new_url != url:
            self['urlkey'] = canonicalize(new_url, surt_ordered)
            other['urlkey'] = self['urlkey']

    referer = other.record.http_headers.get_header('referer')
    if referer:
        self['_referer'] = referer

    return True
def create_record_iter(self, arcv_iter):
    append_post = self.options.get('append_post')
    include_all = self.options.get('include_all')
    block_size = self.options.get('block_size', 16384)
    surt_ordered = self.options.get('surt_ordered', True)
    minimal = self.options.get('minimal')

    if append_post and minimal:
        raise Exception('Sorry, minimal index option and ' +
                        'append POST options can not be used together')

    for record in arcv_iter.iter_records(block_size):
        entry = None

        if not include_all and not minimal and (
                record.status_headers.get_statuscode() == '-'):
            continue

        if record.format == 'warc':
            if (record.rec_type in ('request', 'warcinfo') and
                    not include_all and
                    not append_post):
                continue

            elif (not include_all and
                  record.content_type == 'application/warc-fields'):
                continue

            entry = self.parse_warc_record(record)
        elif record.format == 'arc':
            entry = self.parse_arc_record(record)

        if not entry:
            continue

        if entry.get('url') and not entry.get('urlkey'):
            entry['urlkey'] = canonicalize(entry['url'], surt_ordered)

        compute_digest = False

        if (entry.get('digest', '-') == '-' and
                record.rec_type not in ('revisit', 'request', 'warcinfo')):
            compute_digest = True

        elif not minimal and record.rec_type == 'request' and append_post:
            method = record.status_headers.protocol
            len_ = record.status_headers.get_header('Content-Length')

            post_query = extract_post_query(method,
                                            entry.get('mime'),
                                            len_,
                                            record.stream)
            entry['_post_query'] = post_query

        entry.record = record

        self.begin_payload(compute_digest, entry)
        arcv_iter.read_to_end(record, self.handle_payload)
        entry.set_rec_info(*arcv_iter.member_info)
        self.end_payload(entry)

        yield entry
def create_record_iter(self, raw_iter):
    append_post = self.options.get('append_post')
    include_all = self.options.get('include_all')
    surt_ordered = self.options.get('surt_ordered', True)
    minimal = self.options.get('minimal')

    if append_post and minimal:
        raise Exception('Sorry, minimal index option and ' +
                        'append POST options can not be used together')

    for record in raw_iter:
        entry = None

        if not include_all and not minimal and (
                record.http_headers.get_statuscode() == '-'):
            continue

        if record.rec_type == 'arc_header':
            continue

        if record.format == 'warc':
            if (record.rec_type in ('request', 'warcinfo') and
                    not include_all and
                    not append_post):
                continue

            elif (not include_all and
                  record.content_type == 'application/warc-fields'):
                continue

            entry = self.parse_warc_record(record)
        elif record.format == 'arc':
            entry = self.parse_arc_record(record)

        if not entry:
            continue

        if entry.get('url') and not entry.get('urlkey'):
            entry['urlkey'] = canonicalize(entry['url'], surt_ordered)

        compute_digest = False

        if (entry.get('digest', '-') == '-' and
                record.rec_type not in ('revisit', 'request', 'warcinfo')):
            compute_digest = True

        elif not minimal and record.rec_type == 'request' and append_post:
            method = record.http_headers.protocol
            len_ = record.http_headers.get_header('Content-Length')

            post_query = MethodQueryCanonicalizer(method,
                                                  entry.get('_content_type'),
                                                  len_,
                                                  record.raw_stream)
            entry['_post_query'] = post_query

        entry.record = record

        self.begin_payload(compute_digest, entry)

        while True:
            buff = record.raw_stream.read(BUFF_SIZE)
            if not buff:
                break
            self.handle_payload(buff)

        raw_iter.read_to_end(record)

        entry.set_rec_info(*raw_iter.member_info)
        self.end_payload(entry)

        yield entry
def test_memento_to_cdx(url, mem):
    key = canonicalize(url)
    for ts, target in mem:
        yield key + ' ' + ts + ' ' + url + ' ' + target
def get_url_key_p(ts, url):
    key = ts + '/' + canonicalize(url, False)
    if not url.endswith('/'):
        key += '/'
    return key
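# Illustrative values for get_url_key_p above (assumes pywb's canonicalize,
# where surt_ordered=False yields a host-ordered key like 'example.com/path'
# instead of a SURT):
#
#   get_url_key_p('20170101000000', 'http://example.com/path')
#   -> '20170101000000/example.com/path/'
#
# The appended '/' keeps prefix scans from also matching '.../path-other'.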
def process_record(self, record, flow):
    headers = flow.response.headers
    url = flow.request.req_url
    scheme = flow.request.req_scheme

    if not self.content_rewriter:
        return record.http_headers, StreamIO(record.raw_stream)

    cookie_rewriter = None

    template_params = flow.extra_data

    environ = {'pywb_proxy_magic': self.proxy_magic,
               'webrec.template_params': template_params}

    wb_url = WbUrl(url)
    wb_prefix = ''
    host_prefix = flow.request.req_scheme + '://' + self.proxy_magic

    urlrewriter = SchemeOnlyUrlRewriter(wb_url, '')

    if flow.request.headers.get('X-Requested-With', '').lower() == 'xmlhttprequest':
        urlrewriter.rewrite_opts['is_ajax'] = True

    head_insert_func = (self.head_insert_view.
                        create_insert_func(wb_url,
                                           wb_prefix,
                                           host_prefix,
                                           url,
                                           environ,
                                           False))

    urlkey = canonicalize(wb_url.url)

    cdx = CDXObject()
    cdx['urlkey'] = urlkey
    cdx['timestamp'] = http_date_to_timestamp(headers.get('Memento-Datetime'))
    cdx['url'] = wb_url.url

    if headers.get('Webagg-Source-Coll') == 'live':
        cdx['is_live'] = 'true'

    result = self.content_rewriter.rewrite_content(urlrewriter,
                                                   record.http_headers,
                                                   record.raw_stream,
                                                   head_insert_func,
                                                   urlkey,
                                                   cdx,
                                                   cookie_rewriter,
                                                   environ)

    status_headers, gen, is_rw = result

    status_headers.remove_header('Content-Security-Policy')

    # check for content-length
    res = status_headers.get_header('content-length')
    try:
        if int(res) > 0:
            return status_headers, IterIdent(gen)
    except:
        pass

    # need to either chunk or buffer to get content-length
    if flow.request.http_version == 'HTTP/1.1':
        status_headers.remove_header('content-length')
        status_headers.headers.append(('Transfer-Encoding', 'chunked'))
        #gen = chunk_encode_iter(gen)
    else:
        gen = buffer_iter(status_headers, gen)

    return status_headers, IterIdent(gen)
def render_content(self, wb_url, kwargs, environ):
    wb_url = WbUrl(wb_url)

    host_prefix = self.get_host_prefix(environ)
    rel_prefix = self.get_rel_prefix(environ)
    full_prefix = host_prefix + rel_prefix

    resp = self.handle_custom_response(environ, wb_url,
                                       full_prefix, host_prefix,
                                       kwargs)
    if resp is not None:
        content_type = 'text/html'

        # if not replay outer frame, specify utf-8 charset
        if not self.is_framed_replay(wb_url):
            content_type += '; charset=utf-8'

        return WbResponse.text_response(resp, content_type=content_type)

    urlrewriter = UrlRewriter(wb_url,
                              prefix=full_prefix,
                              full_prefix=full_prefix,
                              rel_prefix=rel_prefix)

    self.unrewrite_referrer(environ)

    urlkey = canonicalize(wb_url.url)

    inputreq = RewriteInputRequest(environ, urlkey, wb_url.url,
                                   self.content_rewriter)

    inputreq.include_post_query(wb_url.url)

    mod_url = None
    use_206 = False
    rangeres = None

    readd_range = False
    async_record_url = None

    if kwargs.get('type') in ('record', 'patch'):
        rangeres = inputreq.extract_range()

        if rangeres:
            mod_url, start, end, use_206 = rangeres

            # if bytes=0- Range request,
            # simply remove the range and still proxy
            if start == 0 and not end and use_206:
                wb_url.url = mod_url
                inputreq.url = mod_url

                del environ['HTTP_RANGE']
                readd_range = True
            else:
                async_record_url = mod_url

    skip = async_record_url is not None

    setcookie_headers = None
    if self.cookie_tracker:
        cookie_key = self.get_cookie_key(kwargs)
        res = self.cookie_tracker.get_cookie_headers(wb_url.url, cookie_key)
        inputreq.extra_cookie, setcookie_headers = res

    r = self._do_req(inputreq, wb_url, kwargs, skip)

    if r.status_code >= 400:
        error = None
        try:
            error = r.raw.read()
            r.raw.close()
        except:
            pass

        if error:
            error = error.decode('utf-8')
        else:
            error = ''

        details = dict(args=kwargs, error=error)
        raise UpstreamException(r.status_code, url=wb_url.url, details=details)

    if async_record_url:
        environ.pop('HTTP_RANGE', '')
        gevent.spawn(self._do_async_req,
                     inputreq,
                     async_record_url,
                     wb_url,
                     kwargs,
                     False)

    record = self.loader.parse_record_stream(r.raw)

    cdx = CDXObject()
    cdx['urlkey'] = urlkey
    cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
    cdx['url'] = wb_url.url

    self._add_custom_params(cdx, r.headers, kwargs)

    if readd_range:
        content_length = (record.status_headers.
                          get_header('Content-Length'))
        try:
            content_length = int(content_length)
            record.status_headers.add_range(0, content_length,
                                            content_length)
        except (ValueError, TypeError):
            pass

    if self.is_ajax(environ):
        head_insert_func = None
        urlrewriter.rewrite_opts['is_ajax'] = True
    else:
        top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
        head_insert_func = (self.head_insert_view.
                            create_insert_func(wb_url,
                                               full_prefix,
                                               host_prefix,
                                               top_url,
                                               environ,
                                               self.framed_replay))

    cookie_rewriter = None
    if self.cookie_tracker:
        cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                           cookie_key)

    result = self.content_rewriter.rewrite_content(urlrewriter,
                                                   record.status_headers,
                                                   record.stream,
                                                   head_insert_func,
                                                   urlkey,
                                                   cdx,
                                                   cookie_rewriter,
                                                   environ)

    status_headers, gen, is_rw = result

    if setcookie_headers:
        status_headers.headers.extend(setcookie_headers)

    return WbResponse(status_headers, gen)
def render_content(self, wb_url, kwargs, environ):
    wb_url = wb_url.replace('#', '%23')
    wb_url = WbUrl(wb_url)

    is_timegate = self._check_accept_dt(wb_url, environ)

    host_prefix = self.get_host_prefix(environ)
    rel_prefix = self.get_rel_prefix(environ)
    full_prefix = host_prefix + rel_prefix
    is_proxy = ('wsgiprox.proxy_host' in environ)

    response = self.handle_custom_response(environ, wb_url,
                                           full_prefix, host_prefix,
                                           kwargs)
    if response:
        return self.format_response(response, wb_url, full_prefix,
                                    is_timegate, is_proxy)

    if is_proxy:
        environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
        urlrewriter = IdentityUrlRewriter(wb_url, '')
        framed_replay = False
    else:
        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix)

        framed_replay = self.framed_replay

    url_parts = urlsplit(wb_url.url)
    if not url_parts.path:
        scheme, netloc, path, query, frag = url_parts
        path = '/'
        url = urlunsplit((scheme, netloc, path, query, frag))
        resp = WbResponse.redir_response(urlrewriter.rewrite(url),
                                         '307 Temporary Redirect')
        if self.enable_memento:
            resp.status_headers['Link'] = MementoUtils.make_link(url, 'original')

        return resp

    self.unrewrite_referrer(environ, full_prefix)

    urlkey = canonicalize(wb_url.url)

    if self.use_js_obj_proxy:
        content_rw = self.js_proxy_rw
    else:
        content_rw = self.default_rw

    inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

    inputreq.include_method_query(wb_url.url)

    range_start, range_end, skip_record = self._check_range(inputreq, wb_url)

    setcookie_headers = None
    if self.cookie_tracker:
        cookie_key = self.get_cookie_key(kwargs)
        res = self.cookie_tracker.get_cookie_headers(wb_url.url,
                                                     urlrewriter,
                                                     cookie_key)
        inputreq.extra_cookie, setcookie_headers = res

    r = self._do_req(inputreq, wb_url, kwargs, skip_record)

    if r.status_code >= 400:
        error = None
        try:
            error = r.raw.read()
            r.raw.close()
        except:
            pass

        if error:
            error = error.decode('utf-8')
        else:
            error = ''

        details = dict(args=kwargs, error=error)
        raise UpstreamException(r.status_code, url=wb_url.url, details=details)

    stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
    record = self.loader.parse_record_stream(stream,
                                             ensure_http_headers=True)

    memento_dt = r.headers.get('Memento-Datetime')
    target_uri = r.headers.get('WARC-Target-URI')

    cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

    #cdx['urlkey'] = urlkey
    #cdx['timestamp'] = http_date_to_timestamp(memento_dt)
    #cdx['url'] = target_uri

    set_content_loc = False

    # Check if Fuzzy Match
    if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
        set_content_loc = True

    # if redir to exact, redir if url or ts are different
    if self.redirect_to_exact:
        if (set_content_loc or
                (wb_url.timestamp != cdx.get('timestamp') and
                 not cdx.get('is_live'))):
            new_url = urlrewriter.get_new_url(url=target_uri,
                                              timestamp=cdx['timestamp'],
                                              mod=wb_url.mod)

            resp = WbResponse.redir_response(new_url,
                                             '307 Temporary Redirect')
            if self.enable_memento:
                if is_timegate and not is_proxy:
                    self._add_memento_links(target_uri, full_prefix,
                                            memento_dt, cdx['timestamp'],
                                            resp.status_headers,
                                            is_timegate, is_proxy)
                else:
                    resp.status_headers['Link'] = MementoUtils.make_link(
                        target_uri, 'original')

            return resp

    self._add_custom_params(cdx, r.headers, kwargs)

    if self._add_range(record, wb_url, range_start, range_end):
        wb_url.mod = 'id_'

    is_ajax = self.is_ajax(environ)

    if is_ajax:
        head_insert_func = None
        urlrewriter.rewrite_opts['is_ajax'] = True
    else:
        top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
        head_insert_func = (self.head_insert_view.
                            create_insert_func(wb_url,
                                               full_prefix,
                                               host_prefix,
                                               top_url,
                                               environ,
                                               framed_replay,
                                               config=self.config))

    cookie_rewriter = None
    if self.cookie_tracker:
        cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                           cookie_key)

    urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

    result = content_rw(record, urlrewriter, cookie_rewriter,
                        head_insert_func, cdx)

    status_headers, gen, is_rw = result

    if setcookie_headers:
        status_headers.headers.extend(setcookie_headers)

    if ' ' not in status_headers.statusline:
        status_headers.statusline += ' None'

    if not is_ajax and self.enable_memento:
        self._add_memento_links(cdx['url'], full_prefix,
                                memento_dt, cdx['timestamp'],
                                status_headers, is_timegate, is_proxy,
                                cdx.get('source-coll'))

        set_content_loc = True

    if set_content_loc and not self.redirect_to_exact:
        status_headers.headers.append(
            ('Content-Location',
             urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                     url=cdx['url'])))

    if not is_proxy:
        self.add_csp_header(wb_url, status_headers)

    response = WbResponse(status_headers, gen)

    return response
def render_content(self, wb_url, kwargs, environ):
    wb_url = wb_url.replace('#', '%23')
    wb_url = WbUrl(wb_url)

    history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
    if history_page:
        wb_url.url = history_page
        is_ajax = True
    else:
        is_ajax = self.is_ajax(environ)

    is_timegate = self._check_accept_dt(wb_url, environ)

    self.prepare_env(environ)

    host_prefix = environ['pywb.host_prefix']
    rel_prefix = self.get_rel_prefix(environ)
    full_prefix = host_prefix + rel_prefix
    pywb_static_prefix = environ['pywb.static_prefix'] + '/'
    is_proxy = ('wsgiprox.proxy_host' in environ)

    # if OPTIONS in proxy mode, just generate the proxy response
    if is_proxy and self.is_preflight(environ):
        return WbResponse.options_response(environ)

    if self.use_js_obj_proxy:
        content_rw = self.js_proxy_rw
    else:
        content_rw = self.default_rw

    # no redirects if in proxy
    redirect_to_exact = self.redirect_to_exact and not is_proxy

    # Check Prefer
    pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ,
                                                  content_rw, is_proxy)

    response = None
    keep_frame_response = False

    # prefer overrides custom response?
    if pref_mod is not None:
        # fast-redirect to preferred
        if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
            new_url = full_prefix + wb_url.to_str(mod=pref_mod)
            headers = [('Preference-Applied', pref_applied),
                       ('Vary', 'Prefer')]

            return WbResponse.redir_response(new_url,
                                             '307 Temporary Redirect',
                                             headers=headers)
        else:
            wb_url.mod = pref_mod
    else:
        if kwargs.get('output'):
            response = self.handle_timemap(wb_url, kwargs, full_prefix)
        elif wb_url.is_query():
            response = self.handle_query(environ, wb_url, kwargs, full_prefix)
        else:
            response = self.handle_custom_response(environ, wb_url,
                                                   full_prefix, host_prefix,
                                                   kwargs)

            keep_frame_response = (not kwargs.get('no_timegate_check') and
                                   is_timegate and
                                   not is_proxy) or redirect_to_exact

    if response and not keep_frame_response:
        return self.format_response(response, wb_url, full_prefix,
                                    is_timegate, is_proxy)

    if is_proxy:
        environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
        urlrewriter = IdentityUrlRewriter(wb_url, '')
        framed_replay = False
    else:
        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix,
                                  pywb_static_prefix=pywb_static_prefix)

        framed_replay = self.framed_replay

    url_parts = urlsplit(wb_url.url)
    if not url_parts.path:
        return self.send_redirect('/', url_parts, urlrewriter)

    self.unrewrite_referrer(environ, full_prefix)

    urlkey = canonicalize(wb_url.url)

    inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

    inputreq.include_method_query(wb_url.url)

    range_start, range_end, skip_record = self._check_range(inputreq, wb_url)

    setcookie_headers = None
    cookie_key = None
    if self.cookie_tracker:
        cookie_key = self.get_cookie_key(kwargs)
        if cookie_key:
            res = self.cookie_tracker.get_cookie_headers(
                wb_url.url, urlrewriter, cookie_key,
                environ.get('HTTP_COOKIE', ''))
            inputreq.extra_cookie, setcookie_headers = res

    r = self._do_req(inputreq, wb_url, kwargs, skip_record)

    if r.status_code >= 400:
        error = None
        try:
            error = r.raw.read()
        except Exception:
            pass
        finally:
            no_except_close(r.raw)

        if error:
            error = error.decode('utf-8')
        else:
            error = ''

        details = dict(args=kwargs, error=error)
        if r.status_code == 404:
            raise NotFoundException(url=wb_url.url, msg=details)
        else:
            raise UpstreamException(r.status_code, url=wb_url.url,
                                    details=details)

    cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

    cdx_url_parts = urlsplit(cdx['url'])

    if cdx_url_parts.path.endswith('/') and not url_parts.path.endswith('/'):
        # add trailing slash
        new_path = url_parts.path + '/'

        no_except_close(r.raw)

        return self.send_redirect(new_path, url_parts, urlrewriter)

    # only redirect to exact if not live, otherwise set to false
    redirect_to_exact = redirect_to_exact and not cdx.get('is_live')

    # return top-frame timegate response, with timestamp from cdx
    if response and keep_frame_response and (not redirect_to_exact or
                                             not is_timegate):
        no_except_close(r.raw)
        return self.format_response(response, wb_url, full_prefix,
                                    is_timegate, is_proxy, cdx['timestamp'])

    stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
    record = self.loader.parse_record_stream(stream,
                                             ensure_http_headers=True)

    memento_dt = r.headers.get('Memento-Datetime')
    target_uri = r.headers.get('WARC-Target-URI')

    # cdx['urlkey'] = urlkey
    # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
    # cdx['url'] = target_uri

    set_content_loc = False

    # Check if Fuzzy Match
    if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
        set_content_loc = True

    # if redirect to exact timestamp (only set if not live)
    if redirect_to_exact:
        if (set_content_loc or is_timegate or
                wb_url.timestamp != cdx.get('timestamp')):
            new_url = urlrewriter.get_new_url(url=target_uri,
                                              timestamp=cdx['timestamp'],
                                              mod=wb_url.mod)

            resp = WbResponse.redir_response(new_url,
                                             '307 Temporary Redirect')
            if self.enable_memento:
                if is_timegate and not is_proxy:
                    self._add_memento_links(target_uri, full_prefix,
                                            memento_dt, cdx['timestamp'],
                                            resp.status_headers,
                                            is_timegate, is_proxy,
                                            pref_applied=pref_applied,
                                            mod=pref_mod,
                                            is_memento=False)
                else:
                    resp.status_headers['Link'] = MementoUtils.make_link(
                        target_uri, 'original')

            return resp

    self._add_custom_params(cdx, r.headers, kwargs, record)

    if self._add_range(record, wb_url, range_start, range_end):
        wb_url.mod = 'id_'

    if is_ajax:
        head_insert_func = None
        urlrewriter.rewrite_opts['is_ajax'] = True
    else:
        top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
        head_insert_func = (self.head_insert_view.create_insert_func(
            wb_url, full_prefix, host_prefix, top_url, environ,
            framed_replay,
            coll=kwargs.get('coll', ''),
            replay_mod=self.replay_mod,
            metadata=kwargs.get('metadata', {}),
            config=self.config))

    cookie_rewriter = None
    if self.cookie_tracker and cookie_key:
        # skip add cookie if service worker is not 200
        # it seems cookie headers from service workers are not applied,
        # so don't update in cache
        if wb_url.mod == 'sw_':
            cookie_key = None

        cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                           cookie_key)

    urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

    result = content_rw(record, urlrewriter, cookie_rewriter,
                        head_insert_func, cdx, environ)

    status_headers, gen, is_rw = result

    if history_page:
        title = DefaultRewriter._extract_title(gen)
        if not title:
            title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

        if not title:
            title = history_page

        self._add_history_page(cdx, kwargs, title)
        return WbResponse.json_response({'title': title})

    if setcookie_headers:
        status_headers.headers.extend(setcookie_headers)

    if ' ' not in status_headers.statusline:
        status_headers.statusline += ' None'

    if not is_ajax and self.enable_memento:
        self._add_memento_links(cdx['url'], full_prefix,
                                memento_dt, cdx['timestamp'],
                                status_headers, is_timegate, is_proxy,
                                cdx.get('source-coll'),
                                mod=pref_mod, pref_applied=pref_applied)

        set_content_loc = True

    if set_content_loc and not redirect_to_exact and not is_proxy:
        status_headers.headers.append(
            ('Content-Location',
             urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                     url=cdx['url'])))

    if not is_proxy:
        self.add_csp_header(wb_url, status_headers)

    response = WbResponse(status_headers, gen)

    if is_proxy and environ.get('HTTP_ORIGIN'):
        response.add_access_control_headers(environ)

    if (r.status_code == 200 and kwargs.get('cache') == 'always' and
            environ.get('HTTP_REFERER')):
        response.status_headers['Cache-Control'] = \
            'public, max-age=31536000, immutable'

    return response