def lookup_revisit(self, lookup_params, digest, url, iso_dt):
    params = {}
    for param in lookup_params:
        if param.startswith('param.'):
            params[param] = lookup_params[param]

    params['url'] = url
    params['closest'] = iso_date_to_timestamp(iso_dt)

    filters = []
    filters.append('!mime:warc/revisit')

    if digest and digest != '-':
        filters.append('digest:' + digest.split(':')[-1])

    params['filter'] = filters

    cdx_iter, errs = self.cdx_lookup(params)

    for cdx in cdx_iter:
        res = self.dupe_policy(cdx, params)
        if res:
            return res

    return None
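# --- illustrative sketch (not part of the source) -------------------------
# lookup_revisit() narrows the CDX query with two filters: exclude earlier
# revisit records, and match on the payload digest with its 'sha1:' prefix
# removed. A standalone demo of the filter list it builds (digest value is
# a made-up example):
digest = 'sha1:2WAXX5NUWNNCS2BDKCO5OVDQBJVNKIVV'
filters = ['!mime:warc/revisit']
if digest and digest != '-':
    filters.append('digest:' + digest.split(':')[-1])
print(filters)
# -> ['!mime:warc/revisit', 'digest:2WAXX5NUWNNCS2BDKCO5OVDQBJVNKIVV']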
def extract_text(self, record):
    url = record.rec_headers.get('WARC-Target-URI')
    date = record.rec_headers.get('WARC-Date')

    id_ = iso_date_to_timestamp(date) + '/' + url
    if id_ not in self.pages:
        return

    mime = self.get_record_mime_type(record)
    if mime not in HTML_MIME_TYPES:
        return

    if record.http_headers and record.http_headers.get_statuscode().startswith('3'):
        return

    extractor = extractors.ArticleExtractor()

    content = record.content_stream().read()

    try:
        content = content.decode("utf-8")
        doc = extractor.get_doc(content)
        if doc.content:
            self.pages[id_]["text"] = doc.content
        if doc.title:
            self.pages[id_]["title"] = doc.title
    except Exception:
        # skip text extraction in case of errors
        pass
def construct_passed_pages_dict(passed_content):
    """Creates a dictionary of the passed pages, keyed by url (or 'ts/url'
    if a timestamp is present), with the title and text as values if they
    have been passed."""
    passed_pages_dict = {}
    for line in passed_content:
        # Skip the file's header if it's been set
        header = json.loads(line)
        if "format" not in header:
            pages_dict = dict(header)

            # Set the default key as url
            key = "%s" % pages_dict["url"]

            # If a timestamp is present, overwrite the key to be 'ts/url'
            if "ts" in pages_dict:
                key = "%s/%s" % (
                    iso_date_to_timestamp(pages_dict["ts"]),
                    pages_dict["url"],
                )

            # Add the key to the dictionary with a blank value
            passed_pages_dict[key] = {}

            # If a title was in the passed pages line, add it to the entry
            if "title" in pages_dict:
                passed_pages_dict[key]["title"] = pages_dict["title"]

            # If text was in the passed pages line, add it to the entry
            if "text" in pages_dict:
                passed_pages_dict[key]["text"] = pages_dict["text"]

    return passed_pages_dict
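# --- illustrative sketch (not part of the source) -------------------------
# The expected input is the line list of a pages.jsonl file: an optional
# header line carrying a "format" key, then one JSON object per page. The
# header format string and sample page values below are assumptions for
# demonstration; json and iso_date_to_timestamp are assumed imported at
# module level, as the function itself requires.
import json

passed_content = [
    json.dumps({"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}),
    json.dumps({"url": "https://example.com/", "title": "Example",
                "ts": "2023-01-01T12:00:00Z", "text": "Example Domain"}),
    json.dumps({"url": "https://example.com/about"}),
]

pages = construct_passed_pages_dict(passed_content)
# -> {'20230101120000/https://example.com/': {'title': 'Example', 'text': 'Example Domain'},
#     'https://example.com/about': {}}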
def _load_different_url_payload(self, cdx, headers_record,
                                failed_files, cdx_loader):
    """
    Handle the case where a duplicate of a capture with the same digest
    exists at a different url.

    If a cdx_server is provided, a query is made for matching
    url, timestamp and digest.

    Raise exception if no matches found.
    """
    digest = cdx.get('digest', '-')

    # if the digest is the empty record digest, don't attempt to look up
    # the payload record! the payload is simply empty, so use the empty
    # payload of the existing record
    if digest == self.EMPTY_DIGEST:
        return headers_record

    ref_target_uri = (headers_record.rec_headers.
                      get_header('WARC-Refers-To-Target-URI'))

    target_uri = headers_record.rec_headers.get_header('WARC-Target-URI')

    # if no target uri, no way to find the original
    if not ref_target_uri:
        raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)

    ref_target_date = (headers_record.rec_headers.
                       get_header('WARC-Refers-To-Date'))

    if not ref_target_date:
        ref_target_date = cdx['timestamp']
    else:
        ref_target_date = iso_date_to_timestamp(ref_target_date)

    try:
        orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
                                                ref_target_date,
                                                digest,
                                                cdx_loader)
    except NotFoundException:
        raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)

    for orig_cdx in orig_cdx_lines:
        try:
            payload_record = self._resolve_path_load(orig_cdx, False,
                                                     failed_files)
            return payload_record
        except ArchiveLoadFailed:
            pass

    raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)
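# --- illustrative sketch (not part of the source) -------------------------
# A revisit record's headers carry the pointers this method follows back to
# the original capture (per the WARC spec; sample values for demonstration):
#
#   WARC-Type: revisit
#   WARC-Target-URI: https://mirror.example.com/page
#   WARC-Refers-To-Target-URI: https://example.com/page
#   WARC-Refers-To-Date: 2023-01-01T12:00:00Z
#   WARC-Payload-Digest: sha1:2WAXX5NUWNNCS2BDKCO5OVDQBJVNKIVV
#
# If WARC-Refers-To-Date is missing, the method falls back to the revisit's
# own CDX timestamp when querying for the original payload.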
def parse_warc_record(self, record):
    """ Parse warc record
    """
    entry = self._create_index_entry(record.rec_type)

    if record.rec_type == 'warcinfo':
        entry['url'] = record.rec_headers.get_header('WARC-Filename')
        entry['urlkey'] = entry['url']
        entry['_warcinfo'] = record.raw_stream.read(record.length)
        return entry

    entry['url'] = record.rec_headers.get_header('WARC-Target-Uri')

    # timestamp
    entry['timestamp'] = iso_date_to_timestamp(
        record.rec_headers.get_header('WARC-Date'))

    # mime
    if record.rec_type == 'revisit':
        entry['mime'] = 'warc/revisit'
    elif self.options.get('minimal'):
        entry['mime'] = '-'
    else:
        def_mime = '-' if record.rec_type == 'request' else 'unk'
        entry.extract_mime(
            record.http_headers.get_header('Content-Type'), def_mime)

    # detected mime from WARC-Identified-Payload-Type
    entry['mime-detected'] = record.rec_headers.get_header(
        'WARC-Identified-Payload-Type')

    # status -- only for response records (by convention)
    if record.rec_type == 'response' and not self.options.get('minimal'):
        entry.extract_status(record.http_headers)
    else:
        entry['status'] = '-'

    # digest
    digest = record.rec_headers.get_header('WARC-Payload-Digest')
    entry['digest'] = digest
    if digest and digest.startswith('sha1:'):
        entry['digest'] = digest[len('sha1:'):]
    elif not entry.get('digest'):
        entry['digest'] = '-'

    # optional json metadata, if present
    metadata = record.rec_headers.get_header('WARC-Json-Metadata')
    if metadata:
        entry['metadata'] = metadata

    return entry
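# --- illustrative sketch (not part of the source) -------------------------
# For a typical 'response' record, parse_warc_record() yields an index
# entry along these lines (field values are assumptions for demonstration):
#
#   {'url': 'https://example.com/',
#    'timestamp': '20230101120000',        # from WARC-Date
#    'mime': 'text/html',                  # from Content-Type
#    'mime-detected': 'text/html',         # from WARC-Identified-Payload-Type
#    'status': '200',
#    'digest': '2WAXX5NUWNNCS2BDKCO5OVDQBJVNKIVV'}  # 'sha1:' prefix stripped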
def init_proxy(self, config):
    """Initialize and start proxy mode. If the proxy configuration entry
    is not contained in the config, this is a no-op.
    Causes handler to become an instance of WSGIProxMiddleware.

    :param dict config: The configuration object used to configure this
        instance of FrontEndApp
    """
    proxy_config = config.get('proxy')
    if not proxy_config:
        return

    if isinstance(proxy_config, str):
        proxy_coll = proxy_config
        proxy_config = {}
    else:
        proxy_coll = proxy_config['coll']

    if '/' in proxy_coll:
        raise Exception('Proxy collection can not contain "/"')

    proxy_config['ca_name'] = proxy_config.get('ca_name', self.PROXY_CA_NAME)
    proxy_config['ca_file_cache'] = proxy_config.get('ca_file_cache', self.PROXY_CA_PATH)

    if proxy_config.get('recording'):
        logging.info('Proxy recording into collection "{0}"'.format(proxy_coll))
        if proxy_coll in self.warcserver.list_fixed_routes():
            raise Exception('Can not record into fixed collection')

        proxy_coll += self.RECORD_ROUTE
        if not config.get('recorder'):
            config['recorder'] = 'live'
    else:
        logging.info('Proxy enabled for collection "{0}"'.format(proxy_coll))

    if proxy_config.get('enable_content_rewrite', True):
        self.proxy_prefix = '/{0}/bn_/'.format(proxy_coll)
    else:
        self.proxy_prefix = '/{0}/id_/'.format(proxy_coll)

    self.proxy_default_timestamp = proxy_config.get('default_timestamp')
    if self.proxy_default_timestamp:
        if not self.ALL_DIGITS.match(self.proxy_default_timestamp):
            try:
                self.proxy_default_timestamp = iso_date_to_timestamp(self.proxy_default_timestamp)
            except Exception:
                raise Exception('Invalid Proxy Timestamp: Must Be All-Digit Timestamp or ISO Date Format')

    self.proxy_coll = proxy_coll

    self.handler = WSGIProxMiddleware(self.handle_request,
                                      self.proxy_route_request,
                                      proxy_host=proxy_config.get('host', 'pywb.proxy'),
                                      proxy_options=proxy_config)
def _load_different_url_payload(self, cdx, headers_record,
                                failed_files, cdx_loader):
    """
    Handle the case where a duplicate of a capture with the same digest
    exists at a different url.

    If a cdx_server is provided, a query is made for matching
    url, timestamp and digest.

    Raise exception if no matches found.
    """
    ref_target_uri = (headers_record.rec_headers.
                      get_header('WARC-Refers-To-Target-URI'))

    target_uri = headers_record.rec_headers.get_header('WARC-Target-URI')

    # if no target uri, no way to find the original
    if not ref_target_uri:
        raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)

    ref_target_date = (headers_record.rec_headers.
                       get_header('WARC-Refers-To-Date'))

    if not ref_target_date:
        ref_target_date = cdx['timestamp']
    else:
        ref_target_date = iso_date_to_timestamp(ref_target_date)

    digest = cdx.get('digest', '-')

    try:
        orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
                                                ref_target_date,
                                                digest,
                                                cdx_loader)
    except NotFoundException:
        raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)

    for orig_cdx in orig_cdx_lines:
        try:
            payload_record = self._resolve_path_load(orig_cdx, False,
                                                     failed_files)
            return payload_record
        except ArchiveLoadFailed:
            pass

    raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)
def _write_line(self, out, index, record, filename):
    url = index.get("url")
    if not url:
        url = record.rec_headers.get("WARC-Target-URI")

    dt = record.rec_headers.get("WARC-Date")
    ts = iso_date_to_timestamp(dt)

    if hasattr(record, "urlkey"):
        urlkey = record.urlkey
    else:
        urlkey = self.get_url_key(url)

    self._do_write(urlkey, ts, index, out)
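# --- illustrative sketch (not part of the source) -------------------------
# The urlkey fallback (self.get_url_key) is conventionally a SURT-style
# canonicalization of the url; which helper backs it here is an assumption,
# but the standalone `surt` package shows the general transformation
# (lowercased, host reversed, query args sorted):
from surt import surt

print(surt('https://example.com/path?b=2&a=1'))
# -> 'com,example)/path?a=1&b=2'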
def _write_line(self, out, index, record, filename):
    # Possibility 1: add custom stuff here through the record's raw stream
    url = index.get('url')
    if not url:
        url = record.rec_headers.get('WARC-Target-URI')

    dt = record.rec_headers.get('WARC-Date')
    ts = iso_date_to_timestamp(dt)

    if hasattr(record, 'urlkey'):
        urlkey = record.urlkey
    else:
        urlkey = self.get_url_key(url)

    self._do_write(urlkey, ts, index, out)
def extract_text(self, record):
    url = record.rec_headers.get('WARC-Target-URI')
    date = record.rec_headers.get('WARC-Date')

    ts = iso_date_to_timestamp(date)
    id_ = ts + '/' + url

    if self.main_url and url == self.main_url:
        print('Found Main Url: {0}'.format(url))
        self.pages[id_] = {'timestamp': ts, 'url': url, 'title': url}

    mime = self.get_record_mime_type(record)
    if mime not in HTML_MIME_TYPES:
        return

    # guard against records with no http headers before reading the status
    if record.http_headers and record.http_headers.get_statuscode().startswith('3'):
        return

    if id_ not in self.pages:
        if self.detect_pages:
            self.pages[id_] = {'timestamp': ts, 'url': url, 'title': url}
        else:
            return

    content = self._read_record(record)
    if not content:
        return

    try:
        extractor = extractors.ArticleExtractor()
        content = content.decode("utf-8")
        doc = extractor.get_doc(content)
        if doc.content:
            self.pages[id_]["text"] = doc.content
        if doc.title:
            self.pages[id_]["title"] = doc.title
    except Exception as e:
        # skip text extraction in case of errors
        print(e)
def create_wr_metadata(self, log, rec_title):
    pagelist = []

    for page in log['pages']:
        if not page['title'].startswith(('http:', 'https:')):
            continue

        pagelist.append(dict(title=page['title'],
                             url=page['title'],
                             timestamp=iso_date_to_timestamp(page['startedDateTime'])))

    metadata = {"title": rec_title,
                "type": "recording",
               }

    if pagelist:
        metadata["pages"] = pagelist

    return metadata
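# --- illustrative sketch (not part of the source) -------------------------
# The 'log' argument follows the HAR log structure: each page carries a
# 'startedDateTime' (ISO 8601) and, in this pipeline, a URL in its 'title'
# field; pages whose title is not a URL are skipped. The 'converter' object
# and sample values are assumptions for demonstration.
log = {
    'pages': [
        {'title': 'https://example.com/',
         'startedDateTime': '2023-01-01T12:00:00Z'},
        {'title': 'Not a URL, skipped',
         'startedDateTime': '2023-01-01T12:00:05Z'},
    ]
}

metadata = converter.create_wr_metadata(log, 'My Recording')
# -> {'title': 'My Recording', 'type': 'recording',
#     'pages': [{'title': 'https://example.com/',
#                'url': 'https://example.com/',
#                'timestamp': '20230101120000'}]}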
def _write_line(self, out, index, record, filename):
    url = index.get('url')
    if not url:
        url = record.rec_headers.get('WARC-Target-URI')

    dt = record.rec_headers.get('WARC-Date')
    ts = iso_date_to_timestamp(dt)

    # TODO: do my own index search here
    if self.current_raw_body != '':
        index['javascript'] = custom_index.javascript_index(self.current_raw_body)

    if hasattr(record, 'urlkey'):
        urlkey = record.urlkey
    else:
        urlkey = self.get_url_key(url)

    self._do_write(urlkey, ts, index, out)
def check_pages_and_text(self, record):
    url = record.rec_headers.get("WARC-Target-URI")
    date = record.rec_headers.get("WARC-Date")
    ts = iso_date_to_timestamp(date)
    id_ = ts + "/" + url
    matched_id = ""

    # Check for both a matching ts/url and url entry
    if id_ in self.passed_pages_dict:
        matched_id = id_

    if url in self.passed_pages_dict:
        matched_id = url

    # If we find a match, build a record
    if matched_id != "":
        self.pages[matched_id] = {"timestamp": ts, "url": url, "title": url}

        # Add title and text if they've been provided
        if "title" in self.passed_pages_dict[matched_id]:
            self.pages[matched_id]["title"] = self.passed_pages_dict[matched_id]["title"]

        if "text" in self.passed_pages_dict[matched_id]:
            self.pages[matched_id]["text"] = self.passed_pages_dict[matched_id]["text"]

        # Delete the entry from our pages dict so we can't match it again
        del self.passed_pages_dict[matched_id]

    if (self.main_url and self.main_url == url
            and self.main_ts and self.main_ts == ts):
        self.main_ts_flag = True
        self.main_url_flag = True
        print("Found Main Url: {0}".format(url))
        print("Found Main ts: {0}".format(ts))

    # If we're not relying on passed-in pages, we want to add all records
    # to the self.pages object
    if self.passed_pages_dict == {}:
        self.pages[id_] = {"timestamp": ts, "url": url, "title": url}

    if self.main_url and self.main_url == url and self.main_ts is None:
        self.main_url_flag = True
        print("Found Main Url: {0}".format(url))

    if id_ not in self.pages:
        self.pages[id_] = {"timestamp": ts, "url": url, "title": url}

    mime = self.get_record_mime_type(record)
    if mime not in HTML_MIME_TYPES:
        return

    # guard against records with no http headers before reading the status
    if record.http_headers and record.http_headers.get_statuscode().startswith("3"):
        return

    if id_ not in self.pages:
        if self.detect_pages:
            self.pages[id_] = {"timestamp": ts, "url": url, "title": url}
        else:
            return

    # if not extracting text, then finish here
    if not self.extract_text:
        return

    content = self._read_record(record)
    if not content:
        return

    try:
        extractor = extractors.ArticleExtractor()
        content = content.decode("utf-8")
        doc = extractor.get_doc(content)

        if doc.content:
            self.pages[id_]["text"] = doc.content
            self.has_text = True

        # only set title if unset, or set to url (default)
        # avoid overriding a user-specified title, if any
        if doc.title and self.pages[id_].get("title", url) == url:
            self.pages[id_]["title"] = doc.title
    except Exception as e:
        # skip text extraction in case of errors
        print("Skipping, Text Extraction Failed For: " + url)
        print(e)
def init_proxy(self, config):
    """Initialize and start proxy mode. If the proxy configuration entry
    is not contained in the config, this is a no-op.
    Causes handler to become an instance of WSGIProxMiddleware.

    :param dict config: The configuration object used to configure this
        instance of FrontEndApp
    """
    proxy_config = config.get('proxy')
    if not proxy_config:
        return

    if isinstance(proxy_config, str):
        proxy_coll = proxy_config
        proxy_config = {}
    else:
        proxy_coll = proxy_config['coll']

    if '/' in proxy_coll:
        raise Exception('Proxy collection can not contain "/"')

    proxy_config['ca_name'] = proxy_config.get('ca_name', self.PROXY_CA_NAME)
    proxy_config['ca_file_cache'] = proxy_config.get('ca_file_cache', self.PROXY_CA_PATH)

    if proxy_config.get('recording'):
        logging.info('Proxy recording into collection "{0}"'.format(proxy_coll))
        if proxy_coll in self.warcserver.list_fixed_routes():
            raise Exception('Can not record into fixed collection')

        proxy_route = proxy_coll + self.RECORD_ROUTE
        if not config.get('recorder'):
            config['recorder'] = 'live'

        self.proxy_record = True
    else:
        logging.info('Proxy enabled for collection "{0}"'.format(proxy_coll))
        self.proxy_record = False
        proxy_route = proxy_coll

    if proxy_config.get('enable_content_rewrite', True):
        self.proxy_prefix = '/{0}/bn_/'.format(proxy_route)
    else:
        self.proxy_prefix = '/{0}/id_/'.format(proxy_route)

    self.proxy_default_timestamp = proxy_config.get('default_timestamp')
    if self.proxy_default_timestamp:
        if not self.ALL_DIGITS.match(self.proxy_default_timestamp):
            try:
                self.proxy_default_timestamp = iso_date_to_timestamp(self.proxy_default_timestamp)
            except Exception:
                raise Exception('Invalid Proxy Timestamp: Must Be All-Digit Timestamp or ISO Date Format')

    self.proxy_coll = proxy_coll

    self.handler = WSGIProxMiddleware(self.handle_request,
                                      self.proxy_route_request,
                                      proxy_host=proxy_config.get('host', 'pywb.proxy'),
                                      proxy_options=proxy_config)
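# --- illustrative sketch (not part of the source) -------------------------
# A hypothetical config dict that would put this app into recording proxy
# mode; the keys mirror the ones read above ('coll', 'recording',
# 'enable_content_rewrite', 'default_timestamp', 'host'), and the ISO
# default_timestamp is converted to 20230101120000 by iso_date_to_timestamp.
config = {
    'proxy': {
        'coll': 'my-coll',
        'recording': True,
        'enable_content_rewrite': True,
        'default_timestamp': '2023-01-01T12:00:00Z',
        'host': 'pywb.proxy',
    }
}
# app.init_proxy(config) would then build proxy_prefix from the recording
# route ('/{coll}{RECORD_ROUTE}/bn_/'); the exact RECORD_ROUTE value is an
# implementation detail of the class, not shown here.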
def write_snapshot(self, user, coll, url, title, html_text,
                   referrer, user_agent, browser=None):
    snap_title = 'Static Snapshots'
    snap_rec = self.sanitize_title(snap_title)

    if not self.manager.has_recording(user, coll, snap_rec):
        recording = self.manager.create_recording(user, coll, snap_rec, snap_title)

    kwargs = dict(user=user,
                  coll=quote(coll),
                  rec=quote(snap_rec, safe='/*'),
                  type='snapshot')

    params = {'url': url}

    upstream_url = self.manager.content_app.get_upstream_url('', kwargs, params)

    headers = {'Content-Type': 'text/html; charset=utf-8',
               'WARC-User-Agent': user_agent,
               'WARC-Referer': referrer,
              }

    r = requests.put(upstream_url,
                     data=BytesIO(html_text.encode('utf-8')),
                     headers=headers,
                    )

    try:
        res = r.json()
        if res['success'] != 'true':
            print(res)
            return {'error_message': 'Snapshot Failed'}

        warc_date = res.get('WARC-Date')
    except Exception as e:
        print(e)
        return {'error_message': 'Snapshot Failed'}

    if not title:
        return {'snapshot': ''}

    if warc_date:
        timestamp = iso_date_to_timestamp(warc_date)
    else:
        timestamp = timestamp_now()

    page_data = {'url': url,
                 'title': title,
                 'timestamp': timestamp,
                 'tags': ['snapshot'],
                }

    if browser:
        page_data['browser'] = browser

    res = self.manager.add_page(user, coll, snap_rec, page_data)

    return {'snapshot': page_data}
def check_pages_and_text(self, record):
    url = record.rec_headers.get("WARC-Target-URI")
    date = record.rec_headers.get("WARC-Date")
    ts = iso_date_to_timestamp(date)
    id_ = ts + "/" + url

    if (self.main_url and self.main_url == url
            and self.main_ts and self.main_ts == ts):
        self.main_ts_flag = True
        self.main_url_flag = True
        print("Found Main Url: {0}".format(url))
        print("Found Main ts: {0}".format(ts))
        self.pages[id_] = {"timestamp": ts, "url": url, "title": url}

    if self.main_url and self.main_url == url and self.main_ts is None:
        self.main_url_flag = True
        print("Found Main Url: {0}".format(url))
        self.pages[id_] = {"timestamp": ts, "url": url, "title": url}

    mime = self.get_record_mime_type(record)
    if mime not in HTML_MIME_TYPES:
        return

    # guard against records with no http headers before reading the status
    if record.http_headers and record.http_headers.get_statuscode().startswith("3"):
        return

    if id_ not in self.pages:
        if self.detect_pages:
            self.pages[id_] = {"timestamp": ts, "url": url, "title": url}
        else:
            return

    # if not extracting text, then finish here
    if not self.extract_text:
        return

    content = self._read_record(record)
    if not content:
        return

    try:
        extractor = extractors.ArticleExtractor()
        content = content.decode("utf-8")
        doc = extractor.get_doc(content)

        if doc.content:
            self.pages[id_]["text"] = doc.content
            self.has_text = True

        # only set title if unset, or set to url (default)
        # avoid overriding a user-specified title, if any
        if doc.title and self.pages[id_].get("title", url) == url:
            self.pages[id_]["title"] = doc.title
    except Exception as e:
        # skip text extraction in case of errors
        print("Skipping, Text Extraction Failed For: " + url)
        print(e)
def write_snapshot(self, user, collection, url, title, html_text,
                   referrer, user_agent, browser=None):
    snap_title = 'Static Snapshots'
    snap_rec_name = self.sanitize_title(snap_title)

    recording = collection.get_recording_by_name(snap_rec_name)
    if not recording:
        recording = collection.create_recording(snap_rec_name,
                                                title=snap_rec_name)

    kwargs = dict(user=user.name,
                  coll=collection.my_id,
                  rec=quote(snap_rec_name, safe='/*'),
                  type='put_record')

    params = {'url': url}

    upstream_url = self.content_app.get_upstream_url('', kwargs, params)

    headers = {'Content-Type': 'text/html; charset=utf-8',
               'WARC-User-Agent': user_agent,
               'WARC-Referer': referrer,
              }

    r = requests.put(upstream_url,
                     data=BytesIO(html_text.encode('utf-8')),
                     headers=headers,
                    )

    try:
        res = r.json()
        if res['success'] != 'true':
            print(res)
            return {'error_message': 'Snapshot Failed'}

        warc_date = res.get('WARC-Date')
    except Exception as e:
        print(e)
        return {'error_message': 'Snapshot Failed'}

    if not title:
        return {'snapshot': ''}

    if warc_date:
        timestamp = iso_date_to_timestamp(warc_date)
    else:
        timestamp = timestamp_now()

    page_data = {'url': url,
                 'title': title,
                 'timestamp': timestamp,
                 'tags': ['snapshot'],
                }

    if browser:
        page_data['browser'] = browser

    res = recording.add_page(page_data)

    return {'snapshot': page_data}
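# --- illustrative sketch (not part of the source) -------------------------
# Both write_snapshot variants fall back from the server-reported WARC-Date
# to the current time; the helpers come from warcio.timeutils:
from warcio.timeutils import iso_date_to_timestamp, timestamp_now

warc_date = '2023-01-01T12:00:00Z'
timestamp = iso_date_to_timestamp(warc_date) if warc_date else timestamp_now()
print(timestamp)  # -> 20230101120000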