def load_test_archive(test_file, offset, length):
    """Load one record from a test archive and pretty-print a summary of it.

    ``test_file`` is appended to the module-level ``test_warc_dir`` prefix to
    form the path; ``offset`` and ``length`` are handed to the loader as-is.

    The printed value is a 3-tuple: ``(format, rec_type)``, the record
    headers, and the status headers. Nothing is returned.
    """
    archive_path = test_warc_dir + test_file
    record = ArcWarcRecordLoader().load(archive_path, offset, length)

    summary = (
        (record.format, record.rec_type),
        record.rec_headers,
        record.status_headers,
    )
    pprint.pprint(summary)
def _init_replay_view(self, config):
    """Assemble and return the ``ReplayView`` described by ``config``.

    Reads two keys from ``config`` via ``.get``:

    * ``'cookie_maker'`` -- forwarded to the record loader.
    * ``'archive_paths'`` -- mapped to path resolvers by
      ``PathResolverMapper``.
    """
    loader = ArcWarcRecordLoader(cookie_maker=config.get('cookie_maker'))

    archive_paths = config.get('archive_paths')
    to_resolvers = PathResolverMapper()
    resolvers = to_resolvers(archive_paths)

    return ReplayView(
        ResolvingLoader(resolvers, record_loader=loader),
        config,
    )
def __init__(self, fileobj, no_record_parse=False, verify_http=False):
    """Remember the input file object and set the initial bookkeeping state.

    :param fileobj: file-like object stored as ``self.fh``.
    :param no_record_parse: stored unchanged as ``self.no_record_parse``.
    :param verify_http: forwarded to the ``ArcWarcRecordLoader`` created
        for this instance.
    """
    # Inputs supplied by the caller.
    self.fh = fileobj
    self.no_record_parse = no_record_parse
    self.loader = ArcWarcRecordLoader(verify_http=verify_http)

    # Bookkeeping that starts out empty / at zero.
    self.reader = None
    self.offset = 0
    self.known_format = None
    self.member_info = None
def load_test_archive(test_file, offset, length):
    """Load one record from a test archive and pretty-print it 160 columns wide.

    ``test_file`` is appended to the module-level ``test_warc_dir`` prefix to
    form the path; ``offset`` and ``length`` are handed to the loader as-is.

    Side effect: sets the module-level
    ``pywb.utils.statusandheaders.WRAP_WIDTH`` to 160 before printing, and it
    is not restored afterwards.

    The printed value is a 3-tuple: ``(format, rec_type)``, the record
    headers, and the status headers. Nothing is returned.
    """
    archive_path = test_warc_dir + test_file
    record = ArcWarcRecordLoader().load(archive_path, offset, length)

    # Same width for the header wrapping and for pprint itself.
    pywb.utils.statusandheaders.WRAP_WIDTH = 160

    summary = (
        (record.format, record.rec_type),
        record.rec_headers,
        record.status_headers,
    )
    pprint.pprint(summary, indent=1, width=160)
def _init_replay_view(self, config):
    """Assemble and return the ``WebRecReplayView`` described by ``config``.

    Reads three keys from ``config`` via ``.get``:

    * ``'cookie_maker'`` -- forwarded to the record loader.
    * ``'archive_paths'`` -- mapped to path resolvers by
      ``PathResolverMapper``.
    * ``'redis_warc_resolver'`` -- when truthy, appended to the loader's
      ``path_resolvers`` after the mapped ones.
    """
    loader = ArcWarcRecordLoader(cookie_maker=config.get('cookie_maker'))

    archive_paths = config.get('archive_paths')
    to_resolvers = PathResolverMapper()
    resolving_loader = ResolvingLoader(to_resolvers(archive_paths),
                                       record_loader=loader)

    extra_resolver = config.get('redis_warc_resolver')
    if extra_resolver:
        resolving_loader.path_resolvers.append(extra_resolver)

    return WebRecReplayView(resolving_loader, config)
def __init__(self, query_handler, config=None):
    """Set up the index reader, the replay view and the fallback settings.

    :param query_handler: stored as ``self.index_reader``.
    :param config: mapping-like settings object. The keys read here are
        ``'cookie_maker'``, ``'archive_paths'`` and ``'fallback'``.

    NOTE(review): ``config`` defaults to ``None`` but ``config.get(...)`` is
    called unconditionally below, so in practice a mapping must be passed.
    """
    super(WBHandler, self).__init__(config)
    self.index_reader = query_handler

    loader = ArcWarcRecordLoader(cookie_maker=config.get('cookie_maker'))
    archive_paths = config.get('archive_paths')
    self.replay = ReplayView(
        ResolvingLoader(paths=archive_paths, record_loader=loader),
        config,
    )

    # No fallback handler is attached here; only its configured name is kept.
    self.fallback_handler = None
    self.fallback_name = config.get('fallback')
def get_rendered_original_stream(warc_filename, warc_offset, compressedendoffset):
    """Fetch one WARC record over WebHDFS and return its stream and type.

    :param warc_filename: path of the WARC file, appended to
        ``WEBHDFS_PREFIX``; ``None`` means "not found".
    :param warc_offset: value sent as the ``offset`` query parameter.
    :param compressedendoffset: when truthy, sent as the ``length`` query
        parameter.
    :returns: ``(record.stream, record.content_type)``, or ``(None, None)``
        when ``warc_filename`` is ``None``.
    """
    # Nothing to fetch when the lookup produced no file.
    if warc_filename is None:
        return None, None

    hdfs_url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        WEBHDFS_PREFIX,
        warc_filename,
        WEBHDFS_USER,
        warc_offset,
    )
    if compressedendoffset:
        hdfs_url = "%s&length=%s" % (hdfs_url, compressedendoffset)

    logger.info("Requesting copy from HDFS: %s " % hdfs_url)
    response = requests.get(hdfs_url, stream=True)
    logger.info("Loading from: %s" % response.url)

    # Presumably so the reader below receives the bytes exactly as stored
    # rather than transfer-decoded -- TODO confirm.
    response.raw.decode_content = False

    loader = ArcWarcRecordLoader()
    logger.info("Passing response to parser...")
    buffered = DecompressingBufferedReader(stream=response.raw)
    record = loader.parse_record_stream(buffered)

    logger.info("RESULT:")
    logger.info(record)
    logger.info("Returning stream...")
    return record.stream, record.content_type
def get_rendered_original(url, type='screenshot', target_timestamp=30001201235900):
    """
    Grabs a rendered resource.

    Only reason Wayback can't do this is that it does not like the extended
    URIs, i.e. 'screenshot:http://', and replaces them with
    'http://screenshot:http://'.

    :param url: original URL of the resource.
    :param type: prefix joined to ``url`` as ``'<type>:<url>'`` for the
        CDX lookup.
    :param target_timestamp: passed to ``lookup_in_cdx`` unchanged.
    :returns: ``(record.stream, record.content_type)``, or a bare ``None``
        (not a tuple) when the CDX lookup yields no WARC filename.
    """
    # Query the CDX server for '<type>:<url>'.
    qurl = "%s:%s" % (type, url)
    warc_filename, warc_offset, compressedendoffset = lookup_in_cdx(
        qurl, target_timestamp)

    if warc_filename is None:
        return None

    # WebHDFS settings come from the environment; a missing variable
    # raises KeyError here.
    webhdfs_prefix = os.environ['WEBHDFS_PREFIX']
    webhdfs_user = os.environ['WEBHDFS_USER']

    hdfs_url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        webhdfs_prefix, warc_filename, webhdfs_user, warc_offset)
    if compressedendoffset:
        hdfs_url = "%s&length=%s" % (hdfs_url, compressedendoffset)

    response = requests.get(hdfs_url, stream=True)
    response.raw.decode_content = False

    loader = ArcWarcRecordLoader()
    buffered = DecompressingBufferedReader(stream=response.raw)
    record = loader.parse_record_stream(buffered)
    return (record.stream, record.content_type)
def get_rendered_original():
    """
    Grabs a rendered resource.

    Only reason Wayback can't do this is that it does not like the extended
    URIs, i.e. 'screenshot:http://', and replaces them with
    'http://screenshot:http://'.

    Reads the ``url`` and ``type`` (default ``'screenshot'``) request
    arguments, looks up ``'<type>:<url>'`` in the CDX index, and sends the
    matching record's stream back with the record's content type. Aborts
    with 404 when the lookup yields no WARC filename.
    """
    url = request.args.get('url')
    app.logger.debug("Got URL: %s" % url)

    render_type = request.args.get('type', 'screenshot')
    app.logger.debug("Got type: %s" % render_type)

    # Query the CDX server for '<type>:<url>'.
    qurl = "%s:%s" % (render_type, url)
    warc_filename, warc_offset = lookup_in_cdx(qurl)
    if warc_filename is None:
        abort(404)

    # Grab the payload from the WARC and return it.
    hdfs_url = "%s%s%s?op=OPEN&user.name=%s&offset=%s" % (
        systems().webhdfs,
        h3().hdfs_root_folder,
        warc_filename,
        webhdfs().user,
        warc_offset,
    )
    response = requests.get(hdfs_url)
    app.logger.info("Loading from: %s" % response.url)
    response.raw.decode_content = False

    loader = ArcWarcRecordLoader()
    # The record is parsed from the fully downloaded body held in memory.
    payload = io.BytesIO(response.content)
    record = loader.parse_record_stream(
        DecompressingBufferedReader(stream=payload))

    print(record)
    print(record.length)
    print(record.stream.limit)
    return send_file(record.stream, mimetype=record.content_type)
def __init__(self, path_resolvers, record_loader=None, no_record_parse=False):
    """Store the path resolvers, the record loader and the parse flag.

    :param path_resolvers: stored unchanged as ``self.path_resolvers``.
    :param record_loader: loader stored as ``self.record_loader``. When
        omitted (or ``None``), a new ``ArcWarcRecordLoader()`` is created
        for this instance.
    :param no_record_parse: stored unchanged as ``self.no_record_parse``.
    """
    self.path_resolvers = path_resolvers

    # A default written as ``ArcWarcRecordLoader()`` in the signature is
    # evaluated once, when the function is defined, so every instance would
    # share that single loader object. Build the default per instance.
    if record_loader is None:
        record_loader = ArcWarcRecordLoader()
    self.record_loader = record_loader

    self.no_record_parse = no_record_parse
def parse_stream_error(**params):
    """Parse a record stream, printing the exception class name on failure.

    All keyword arguments are forwarded to
    ``ArcWarcRecordLoader().parse_record_stream``.

    :returns: the parsed record on success. If any ``Exception`` is raised,
        ``'Exception: <ClassName>'`` is printed and ``None`` is returned
        implicitly.
    """
    try:
        return ArcWarcRecordLoader().parse_record_stream(**params)
    except Exception as e:
        # Single-argument print(...) emits the same text under Python 2 and
        # Python 3; the bare print statement is a syntax error on Python 3.
        print('Exception: ' + e.__class__.__name__)