def iter_records(self, block_size=16384):
    """Iterate over each record in the underlying stream.

    Wraps ``self.fh`` in a DecompressingBufferedReader and yields parsed
    records one at a time.  If a gzip member is found to be invalid
    (``read_next_member()`` returned falsy), the error is raised just
    before the *next* record would be yielded.

    :param block_size: buffer size passed to the decompressing reader
    """
    # removed unused local 'decomp_type' -- the reader auto-detects gzip
    self.reader = DecompressingBufferedReader(self.fh, block_size=block_size)
    self.offset = self.fh.tell()
    self.next_line = None

    is_valid = True

    while True:
        try:
            record = self._next_record(self.next_line)
            # previous member was invalid; surface the error now
            if not is_valid:
                self._raise_err()
            yield record
        except EOFError:
            break

        # consume the remainder of the current record
        self.read_to_end(record)

        if self.reader.decompressor:
            is_valid = self.reader.read_next_member()
def test_err_compress_mix():
    # error: a gzip member followed by uncompressed bytes -- considered invalid
    stream = BytesIO(compress('ABC') + b'123')
    reader = DecompressingBufferedReader(stream, decomp_type='gzip')

    assert reader.read() == b'ABC'

    reader.read_next_member()
    # the trailing non-gzip data is not exposed
    assert reader.read() == b''
def test_s3_read_1():
    pytest.importorskip('boto')

    # fetch a known byte range of a Common Crawl WARC from S3
    res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/'
                             'segments/1424936462700.28/warc/'
                             'CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
                             offset=53235662,
                             length=2526)

    buff = res.read()
    assert len(buff) == 2526

    # range should decompress to the start of a WARC response record
    reader = DecompressingBufferedReader(BytesIO(buff))
    assert reader.readline() == b'WARC/1.0\r\n'
    assert reader.readline() == b'WARC-Type: response\r\n'
def _check_encoding(self, rewritten_headers, stream, enc):
    """If *enc* was removed from content-encoding, make the stream decode it."""
    if not rewritten_headers.contains_removed_header('content-encoding', enc):
        return stream

    # optimize: an existing ChunkedDataReader can decode in place
    if isinstance(stream, ChunkedDataReader):
        stream.set_decomp(enc)
    else:
        stream = DecompressingBufferedReader(stream, decomp_type=enc)

    # decoded length is unknown, so the original length no longer applies
    rewritten_headers.status_headers.remove_header('content-length')

    return stream
def __call__(self, block_size=16384):
    """Iterate over each record in the stream.

    Yields parsed records one at a time.  If the input is gzip, each
    record is expected to live in its own gzip member; a member that
    holds multiple records is reported via _raise_invalid_gzip_err()
    just before the next record would be yielded.

    :param block_size: buffer size passed to the decompressing reader
    """
    # NOTE(review): decomp_type is assigned but never used here --
    # the reader below auto-detects compression
    decomp_type = 'gzip'
    self.reader = DecompressingBufferedReader(self.fh, block_size=block_size)
    self.offset = self.fh.tell()
    self.next_line = None

    raise_invalid_gzip = False
    empty_record = False
    record = None

    while True:
        try:
            # NOTE(review): curr_offset is captured but not used in this body
            curr_offset = self.fh.tell()
            record = self._next_record(self.next_line)
            # the previous gzip member was malformed; raise now
            if raise_invalid_gzip:
                self._raise_invalid_gzip_err()
            yield record
        except EOFError:
            empty_record = True

        # drain the rest of the current record before moving on
        if record:
            self.read_to_end(record)

        if self.reader.decompressor:
            # if another gzip member, continue
            if self.reader.read_next_member():
                continue
            # if empty record, then we're done
            elif empty_record:
                break
            # otherwise, probably a gzip
            # containing multiple non-chunked records
            # raise this as an error
            else:
                raise_invalid_gzip = True
        # non-gzip, so we're done
        elif empty_record:
            break
def load(self, url, offset, length):
    """Load a single record from *url* at *offset* with *length*
    and parse as either a warc or arc record.

    :param url: location of the archive file
    :param offset: byte offset of the record within the file
    :param length: record length; any non-numeric value means unknown
    :return: the parsed record
    """
    try:
        length = int(length)
    # narrowed from a bare except: only conversion failures mean "unknown"
    except (ValueError, TypeError):
        length = -1

    # int() replaces Python-2-only long(); equivalent on both versions
    stream = self.loader.load(url, int(offset), length)

    decomp_type = 'gzip'

    # Create decompressing stream
    stream = DecompressingBufferedReader(stream=stream,
                                         decomp_type=decomp_type,
                                         block_size=self.block_size)

    return self.parse_record_stream(stream)
def get_rendered_original_stream(warc_filename, warc_offset, compressedendoffset):
    """Fetch a WARC record over WebHDFS; return (stream, content_type)."""
    # Nothing to fetch if the lookup produced no WARC file.
    if warc_filename is None:
        return None, None

    # Build the WebHDFS OPEN request for the record's byte range.
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset:
        url = "%s&length=%s" % (url, compressedendoffset)
    logger.info("Requesting copy from HDFS: %s " % url)

    response = requests.get(url, stream=True)
    logger.info("Loading from: %s" % response.url)
    # Keep the raw (possibly compressed) bytes; the reader decompresses.
    response.raw.decode_content = False

    logger.info("Passing response to parser...")
    loader = ArcWarcRecordLoader()
    record = loader.parse_record_stream(
        DecompressingBufferedReader(stream=response.raw))
    logger.info("RESULT:")
    logger.info(record)

    logger.info("Returning stream...")
    return record.stream, record.content_type
def get_rendered_original(url, type='screenshot', target_timestamp=30001201235900):
    """
    Grabs a rendered resource.

    Only reason Wayback can't do this is that it does not like the
    extended URIs i.e. 'screenshot:http://' and replaces them with
    'http://screenshot:http://'
    """
    # CDX lookup uses the extended 'type:url' form.
    qurl = "%s:%s" % (type, url)

    warc_filename, warc_offset, compressedendoffset = lookup_in_cdx(
        qurl, target_timestamp)

    # Not in the index at all:
    if warc_filename is None:
        return None

    # Build the WebHDFS OPEN request for the record's byte range.
    WEBHDFS_PREFIX = os.environ['WEBHDFS_PREFIX']
    WEBHDFS_USER = os.environ['WEBHDFS_USER']
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset:
        url = "%s&length=%s" % (url, compressedendoffset)

    response = requests.get(url, stream=True)
    # Keep the payload compressed; the reader handles decompression.
    response.raw.decode_content = False

    record = ArcWarcRecordLoader().parse_record_stream(
        DecompressingBufferedReader(stream=response.raw))

    return (record.stream, record.content_type)
def extract_text(entry):
    """Return the decoded body of *entry* as bytes, or b'' if empty or binary.

    :param entry: object with a seekable ``buffer`` stream and a ``record``
                  whose status headers may declare a content-encoding
    :return: body bytes, or b'' when there is no buffer or the body is binary
    """
    buff_reader = entry.buffer
    if not buff_reader:
        return b''

    buff_reader.seek(0)

    # body was stored compressed; wrap with a decompressing reader
    if entry.record.status_headers.get_header('content-encoding'):
        buff_reader = DecompressingBufferedReader(buff_reader)

    # collect chunks and join once instead of quadratic bytes +=
    chunks = []
    while True:
        new_buff = buff_reader.read()
        if not new_buff:
            break
        chunks.append(new_buff)

    buff = b''.join(chunks)

    # binary payloads are not text -- skip them entirely
    if is_binary_string(buff):
        return b''

    return buff
def get_rendered_original():
    """
    Grabs a rendered resource.

    Only reason Wayback can't do this is that it does not like the extended
    URIs i.e. 'screenshot:http://' and replaces them with
    'http://screenshot:http://'
    """
    # Request parameters: the target URL and the resource type.
    url = request.args.get('url')
    app.logger.debug("Got URL: %s" % url)
    #
    # NOTE(review): 'type' shadows the builtin -- presumably intentional to
    # match the 'type:url' CDX key below; confirm before renaming
    type = request.args.get('type', 'screenshot')
    app.logger.debug("Got type: %s" % type)

    # Query URL
    qurl = "%s:%s" % (type, url)

    # Query CDX Server for the item
    (warc_filename, warc_offset) = lookup_in_cdx(qurl)

    # If not found, say so:
    if warc_filename is None:
        abort(404)

    # Grab the payload from the WARC and return it.
    r = requests.get("%s%s%s?op=OPEN&user.name=%s&offset=%s" %
                     (systems().webhdfs, h3().hdfs_root_folder, warc_filename,
                      webhdfs().user, warc_offset))
    app.logger.info("Loading from: %s" % r.url)
    # Keep raw bytes; the whole body is buffered below before parsing.
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    record = rl.parse_record_stream(
        DecompressingBufferedReader(stream=io.BytesIO(r.content)))
    print(record)
    print(record.length)
    print(record.stream.limit)
    return send_file(record.stream, mimetype=record.content_type)
def rewrite_content(self, urlrewriter, headers, stream,
                    head_insert_func=None, urlkey='', cdx=None):
    """Rewrite a response's headers and, when it is text, its body.

    Returns a ``(status_headers, content_gen, is_rewritten)`` tuple where
    ``content_gen`` yields the (possibly rewritten) body and
    ``is_rewritten`` is True only when a text rewriter was applied.
    """
    wb_url = urlrewriter.wburl

    # identity mode (or banner-only with nothing to insert): pass through
    if (wb_url.is_identity or
        (not head_insert_func and wb_url.is_banner_only)):
        status_headers, stream = self.sanitize_content(headers, stream)
        return (status_headers, self.stream_to_gen(stream), False)

    # banner-only mode does no url rewriting
    if wb_url.is_banner_only:
        urlrewriter = None

    rule = self.ruleset.get_first_match(urlkey)

    (rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
                                                        rule,
                                                        headers,
                                                        stream)

    status_headers = rewritten_headers.status_headers

    # use rewritten headers, but no further rewriting needed
    if rewritten_headers.text_type is None:
        return (status_headers, self.stream_to_gen(stream), False)

    # Handle text content rewriting
    # ====================================================================
    # special case -- need to ungzip the body

    text_type = rewritten_headers.text_type

    # see known js/css modifier specified, the context should run
    # default text_type
    mod = wb_url.mod

    stream_raw = False
    encoding = None
    first_buff = None

    # body arrived gzipped and the header was stripped -- decode it here
    if (rewritten_headers.
         contains_removed_header('content-encoding', 'gzip')):

        #optimize: if already a ChunkedDataReader, add gzip
        if isinstance(stream, ChunkedDataReader):
            stream.set_decomp('gzip')
        else:
            stream = DecompressingBufferedReader(stream)

    # explicit js/css modifiers override the sniffed text type
    if mod == 'js_':
        text_type, stream = self._resolve_text_type('js',
                                                    text_type,
                                                    stream)
    elif mod == 'cs_':
        text_type, stream = self._resolve_text_type('css',
                                                    text_type,
                                                    stream)

    rewriter_class = rule.rewriters[text_type]

    # for html, need to perform header insert, supply js, css, xml
    # rewriters
    if text_type == 'html':
        head_insert_str = ''

        if head_insert_func:
            head_insert_str = head_insert_func(rule, cdx)
            head_insert_str = head_insert_str.encode('utf-8')

        # banner-only: insert the head snippet but leave the body as-is
        if wb_url.is_banner_only:
            gen = self._head_insert_only_gen(head_insert_str, stream)

            content_len = headers.get_header('Content-Length')
            try:
                content_len = int(content_len)
            except Exception:
                content_len = None

            # grow Content-Length to account for the inserted snippet
            if content_len and content_len >= 0:
                content_len = str(content_len + len(head_insert_str))
                status_headers.replace_header('Content-Length',
                                              content_len)

            return (status_headers, gen, False)

        rewriter = rewriter_class(urlrewriter,
                                  js_rewriter_class=rule.rewriters['js'],
                                  css_rewriter_class=rule.rewriters['css'],
                                  head_insert=head_insert_str,
                                  defmod=self.defmod,
                                  parse_comments=rule.parse_comments)

    else:
        if wb_url.is_banner_only:
            return (status_headers, self.stream_to_gen(stream), False)

        # apply one of (js, css, xml) rewriters
        rewriter = rewriter_class(urlrewriter)

    # Create rewriting generator
    gen = self.rewrite_text_stream_to_gen(stream,
                                          rewrite_func=rewriter.rewrite,
                                          final_read_func=rewriter.close,
                                          first_buff=first_buff)

    return (status_headers, gen, True)
def test_brotli():
    """Reader with decomp_type='br' decodes the brotli test fixture."""
    with open(get_test_dir() + 'text_content/quickfox_repeated.compressed', 'rb') as fh:
        x = DecompressingBufferedReader(fh, decomp_type='br')
        # bug fix: comparison was missing its 'assert', so this test
        # could never fail
        assert x.read() == b'The quick brown fox jumps over the lazy dog' * 4096