def test_err_compress_mix():
    # error: compressed member, followed by not compressed -- considered invalid
    x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'),
                                    decomp_type='gzip')
    b = x.read()
    b = x.read_next_member()
    with pytest.raises(zlib.error):
        x.read()

def iter_records(self, block_size=16384):
    """ iterate over each record
    """
    decomp_type = 'gzip'

    self.reader = DecompressingBufferedReader(self.fh,
                                              block_size=block_size)
    self.offset = self.fh.tell()
    self.next_line = None

    is_valid = True

    while True:
        try:
            record = self._next_record(self.next_line)
            if not is_valid:
                self._raise_err()

            yield record
        except EOFError:
            break

        self.read_to_end(record)

        if self.reader.decompressor:
            is_valid = self.reader.read_next_member()

def test_s3_read_1():
    pytest.importorskip('boto')

    res = BlockLoader().load('s3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
                             offset=53235662,
                             length=2526)

    buff = res.read()
    assert len(buff) == 2526

    reader = DecompressingBufferedReader(BytesIO(buff))
    assert reader.readline() == b'WARC/1.0\r\n'
    assert reader.readline() == b'WARC-Type: response\r\n'

def test_s3_read_1():
    pytest.importorskip('boto')

    res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
                             offset=53235662,
                             length=2526)

    buff = res.read()
    assert len(buff) == 2526

    reader = DecompressingBufferedReader(BytesIO(buff))
    assert reader.readline() == b'WARC/1.0\r\n'
    assert reader.readline() == b'WARC-Type: response\r\n'

def __call__(self, block_size=16384):
    """ iterate over each record
    """
    decomp_type = 'gzip'

    self.reader = DecompressingBufferedReader(self.fh,
                                              block_size=block_size)
    self.offset = self.fh.tell()
    self.next_line = None

    raise_invalid_gzip = False
    empty_record = False
    record = None

    while True:
        try:
            curr_offset = self.fh.tell()
            record = self._next_record(self.next_line)
            if raise_invalid_gzip:
                self._raise_invalid_gzip_err()

            yield record

        except EOFError:
            empty_record = True

        if record:
            self.read_to_end(record)

        if self.reader.decompressor:
            # if another gzip member, continue
            if self.reader.read_next_member():
                continue

            # if empty record, then we're done
            elif empty_record:
                break

            # otherwise, probably a gzip
            # containing multiple non-chunked records
            # raise this as an error
            else:
                raise_invalid_gzip = True

        # non-gzip, so we're done
        elif empty_record:
            break

def iter_records(self):
    """ iterate over each record
    """
    decomp_type = 'gzip'
    block_size = 16384

    self.reader = DecompressingBufferedReader(self.fh,
                                              block_size=block_size)
    self.offset = self.fh.tell()

    next_line = None

    while True:
        try:
            record = self._next_record(next_line)
            yield record
        except EOFError:
            break

        self.read_to_end(record)

        # for non-compressed, consume blank lines here
        if not self.reader.decompressor:
            next_line = self._consume_blanklines()
            if next_line is None:
                # at end of file
                break

        # reset reader for next member
        else:
            self.reader.read_next_member()

def test_err_compress_mix():
    # error: compressed member, followed by not compressed -- considered invalid
    x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'),
                                    decomp_type='gzip')
    b = x.read()
    assert b == b'ABC'
    x.read_next_member()
    assert x.read() == b''

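For contrast, a minimal sketch of the valid case (assuming the same compress() helper used above; the test name is hypothetical): two complete gzip members concatenated back to back, with read_next_member() resetting the decompressor between them.

def test_ok_compress_concat():  # hypothetical test name, not from the original suite
    x = DecompressingBufferedReader(BytesIO(compress('ABC') + compress('123')),
                                    decomp_type='gzip')
    assert x.read() == b'ABC'
    # advance to the second gzip member before reading again
    x.read_next_member()
    assert x.read() == b'123'
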
def _check_encoding(self, rewritten_headers, stream, enc):
    if (rewritten_headers.contains_removed_header('content-encoding', enc)):
        # optimize: if already a ChunkedDataReader, add the encoding
        if isinstance(stream, ChunkedDataReader):
            stream.set_decomp(enc)
        else:
            stream = DecompressingBufferedReader(stream, decomp_type=enc)

        rewritten_headers.status_headers.remove_header('content-length')

    return stream

def extract_text(entry):
    buff_reader = entry.buffer
    if not buff_reader:
        return b''

    buff_reader.seek(0)

    if entry.record.status_headers.get_header('content-encoding'):
        buff_reader = DecompressingBufferedReader(buff_reader)

    buff = b''
    while True:
        new_buff = buff_reader.read()
        if not new_buff:
            break
        buff += new_buff

    if is_binary_string(buff):
        return b''

    return buff

def iter_records(self, block_size=16384):
    """ iterate over each record
    """
    decomp_type = 'gzip'

    self.reader = DecompressingBufferedReader(self.fh,
                                              block_size=block_size)
    self.offset = self.fh.tell()
    self.next_line = None

    raise_invalid_gzip = False
    empty_record = False
    record = None

    while True:
        try:
            curr_offset = self.fh.tell()
            record = self._next_record(self.next_line)
            if raise_invalid_gzip:
                self._raise_invalid_gzip_err()

            yield record

        except EOFError:
            empty_record = True

        if record:
            self.read_to_end(record)

        if self.reader.decompressor:
            # if another gzip member, continue
            if self.reader.read_next_member():
                continue

            # if empty record, then we're done
            elif empty_record:
                break

            # otherwise, probably a gzip
            # containing multiple non-chunked records
            # raise this as an error
            else:
                raise_invalid_gzip = True

        # non-gzip, so we're done
        elif empty_record:
            break

def load(self, url, offset, length):
    """ Load a single record from given url at offset with length
    and parse as either warc or arc record
    """
    try:
        length = int(length)
    except:
        length = -1

    stream = self.loader.load(url, long(offset), length)
    decomp_type = 'gzip'

    # Create decompressing stream
    stream = DecompressingBufferedReader(stream=stream,
                                         decomp_type=decomp_type,
                                         block_size=self.block_size)

    return self.parse_record_stream(stream)

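A minimal local-file sketch of the same pattern (the helper name, path, and offset are illustrative, not part of the code above): seek the raw file to the start of one gzip member and hand the decompressing reader to the loader to parse a single record.

def load_record_at(path, offset):  # hypothetical helper
    fh = open(path, 'rb')
    fh.seek(offset)  # byte offset of one record's gzip member
    stream = DecompressingBufferedReader(stream=fh, decomp_type='gzip')
    return ArcWarcRecordLoader().parse_record_stream(stream)
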
def get_rendered_original_stream(warc_filename, warc_offset, compressedendoffset):
    # If not found, say so:
    if warc_filename is None:
        return None, None

    # Grab the payload from the WARC and return it.
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (WEBHDFS_PREFIX, warc_filename,
                                                   WEBHDFS_USER, warc_offset)
    if compressedendoffset:
        url = "%s&length=%s" % (url, compressedendoffset)
    logger.info("Requesting copy from HDFS: %s " % url)
    r = requests.get(url, stream=True)
    logger.info("Loading from: %s" % r.url)
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    logger.info("Passing response to parser...")
    record = rl.parse_record_stream(DecompressingBufferedReader(stream=r.raw))
    logger.info("RESULT:")
    logger.info(record)
    logger.info("Returning stream...")
    return record.stream, record.content_type

def get_rendered_original(url, type='screenshot', target_timestamp=30001201235900):
    """
    Grabs a rendered resource.

    Only reason Wayback can't do this is that it does not like the extended URIs
    i.e. 'screenshot:http://' and replaces them with 'http://screenshot:http://'
    """
    # Query URL
    qurl = "%s:%s" % (type, url)

    # Query CDX Server for the item
    #logger.info("Querying CDX for prefix...")
    warc_filename, warc_offset, compressedendoffset = lookup_in_cdx(qurl, target_timestamp)

    # If not found, say so:
    if warc_filename is None:
        return None

    # Grab the payload from the WARC and return it.
    WEBHDFS_PREFIX = os.environ['WEBHDFS_PREFIX']
    WEBHDFS_USER = os.environ['WEBHDFS_USER']
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (WEBHDFS_PREFIX, warc_filename,
                                                   WEBHDFS_USER, warc_offset)
    if compressedendoffset:
        url = "%s&length=%s" % (url, compressedendoffset)
    #logger.info("Requesting copy from HDFS: %s " % url)
    r = requests.get(url, stream=True)
    #logger.info("Loading from: %s" % r.url)
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    #logger.info("Passing response to parser...")
    record = rl.parse_record_stream(DecompressingBufferedReader(stream=r.raw))
    #logger.info("RESULT:")
    #logger.info(record)
    #logger.info("Returning stream...")
    return (record.stream, record.content_type)

def get_rendered_original():
    """
    Grabs a rendered resource.

    Only reason Wayback can't do this is that it does not like the extended URIs
    i.e. 'screenshot:http://' and replaces them with 'http://screenshot:http://'
    """
    url = request.args.get('url')
    app.logger.debug("Got URL: %s" % url)
    #
    type = request.args.get('type', 'screenshot')
    app.logger.debug("Got type: %s" % type)

    # Query URL
    qurl = "%s:%s" % (type, url)

    # Query CDX Server for the item
    (warc_filename, warc_offset) = lookup_in_cdx(qurl)

    # If not found, say so:
    if warc_filename is None:
        abort(404)

    # Grab the payload from the WARC and return it.
    r = requests.get("%s%s%s?op=OPEN&user.name=%s&offset=%s" %
                     (systems().webhdfs, h3().hdfs_root_folder, warc_filename,
                      webhdfs().user, warc_offset))
    app.logger.info("Loading from: %s" % r.url)
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    record = rl.parse_record_stream(
        DecompressingBufferedReader(stream=io.BytesIO(r.content)))
    print(record)
    print(record.length)
    print(record.stream.limit)
    return send_file(record.stream, mimetype=record.content_type)

class ArchiveIterator(object):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.
    """
    def __init__(self, fileobj):
        self.fh = fileobj
        self.loader = ArcWarcRecordLoader()
        self.reader = None
        self.offset = 0
        self.known_format = None
        self.member_info = None

    def iter_records(self):
        """ iterate over each record
        """
        decomp_type = 'gzip'
        block_size = 16384

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        next_line = None

        while True:
            try:
                record = self._next_record(next_line)
                yield record
            except EOFError:
                break

            self.read_to_end(record)

            # for non-compressed, consume blank lines here
            if not self.reader.decompressor:
                next_line = self._consume_blanklines()
                if next_line is None:
                    # at end of file
                    break

            # reset reader for next member
            else:
                self.reader.read_next_member()

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length
        """
        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None

            if line.rstrip() == '':
                self.offset = self.fh.tell() - self.reader.rem_length()
                continue

            return line

    def read_to_end(self, record, compute_digest=False):
        """ Read remainder of the stream
        If a digester is included, update it with the data read
        """
        if self.member_info:
            return self.member_info

        if compute_digest:
            digester = hashlib.sha1()
        else:
            digester = None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(8192)
            if not b:
                break
            num += len(b)
            if digester:
                digester.update(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        if self.reader.decompressor:
            self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        length = self.offset - curr_offset

        if compute_digest:
            digest = base64.b32encode(digester.digest())
        else:
            digest = None

        self.member_info = (curr_offset, length, digest)
        return self.member_info

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format)

        self.member_info = None

        # Track known format for faster parsing of other records
        self.known_format = record.format

        return record

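A usage sketch for the iterator above (not from the original sources; the filename is a placeholder): a consumer can call read_to_end() itself to obtain the (offset, length, digest) tuple for the record just yielded, and the iterator's own read_to_end() call then returns early because member_info is already set.

with open('example.warc.gz', 'rb') as fh:  # placeholder path
    it = ArchiveIterator(fh)
    for record in it.iter_records():
        # attributes set by ArcWarcRecordLoader.parse_record_stream()
        print(record.format, record.content_type)
        # offset/length/digest of the member just read
        offset, length, digest = it.read_to_end(record, compute_digest=True)
        print(offset, length, digest)
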
def rewrite_content(self, urlrewriter, headers, stream,
                    head_insert_func=None, urlkey='',
                    cdx=None):

    wb_url = urlrewriter.wburl

    if (wb_url.is_identity or
        (not head_insert_func and wb_url.is_banner_only)):
        status_headers, stream = self.sanitize_content(headers, stream)
        return (status_headers, self.stream_to_gen(stream), False)

    if wb_url.is_banner_only:
        urlrewriter = None

    rule = self.ruleset.get_first_match(urlkey)

    (rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
                                                        rule,
                                                        headers,
                                                        stream)

    status_headers = rewritten_headers.status_headers

    # use rewritten headers, but no further rewriting needed
    if rewritten_headers.text_type is None:
        return (status_headers, self.stream_to_gen(stream), False)

    # Handle text content rewriting
    # ====================================================================
    # special case -- need to ungzip the body

    text_type = rewritten_headers.text_type

    # see known js/css modifier specified, the context should run
    # default text_type
    mod = wb_url.mod

    stream_raw = False
    encoding = None
    first_buff = ''

    if rewritten_headers.contains_removed_header('content-encoding', 'gzip'):
        #optimize: if already a ChunkedDataReader, add gzip
        if isinstance(stream, ChunkedDataReader):
            stream.set_decomp('gzip')
        else:
            stream = DecompressingBufferedReader(stream)

    if mod == 'js_':
        text_type, stream = self._resolve_text_type('js',
                                                    text_type,
                                                    stream)
    elif mod == 'cs_':
        text_type, stream = self._resolve_text_type('css',
                                                    text_type,
                                                    stream)

    rewriter_class = rule.rewriters[text_type]

    # for html, need to perform header insert, supply js, css, xml
    # rewriters
    if text_type == 'html':
        head_insert_str = ''
        charset = rewritten_headers.charset

        # if no charset set, attempt to extract from first 1024
        if not rewritten_headers.charset:
            first_buff = stream.read(1024)
            charset = self._extract_html_charset(first_buff,
                                                 status_headers)

        if head_insert_func:
            if not charset:
                charset = 'utf-8'

            head_insert_str = head_insert_func(rule, cdx)
            head_insert_str = head_insert_str.encode(charset)

        if wb_url.is_banner_only:
            gen = self._head_insert_only_gen(head_insert_str,
                                             stream,
                                             first_buff)

            content_len = headers.get_header('Content-Length')
            try:
                content_len = int(content_len)
            except Exception:
                content_len = None

            if content_len and content_len >= 0:
                content_len = str(content_len + len(head_insert_str))
                status_headers.replace_header('Content-Length',
                                              content_len)

            return (status_headers, gen, False)

        rewriter = rewriter_class(urlrewriter,
                                  js_rewriter_class=rule.rewriters['js'],
                                  css_rewriter_class=rule.rewriters['css'],
                                  head_insert=head_insert_str,
                                  defmod=self.defmod,
                                  parse_comments=rule.parse_comments)

    else:
        if wb_url.is_banner_only:
            return (status_headers, self.stream_to_gen(stream), False)

        # apply one of (js, css, xml) rewriters
        rewriter = rewriter_class(urlrewriter)

    # align to line end for all non-html rewriting
    align = (text_type != 'html')

    # Create rewriting generator
    gen = self.rewrite_text_stream_to_gen(stream,
                                          rewrite_func=rewriter.rewrite,
                                          final_read_func=rewriter.close,
                                          first_buff=first_buff,
                                          align_to_line=align)

    return (status_headers, gen, True)

class ArchiveIterator(object):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.
    """

    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues
    beyond single record.

    This file is probably not a multi-chunk gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.

    This file is likely still valid and you can use it by decompressing it:

    gunzip myfile.{0}.gz

    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:

    warc2warc -Z myfile.{0} > myfile.{0}.gz
"""

    INC_RECORD = """\
    WARNING: Record not followed by newline, perhaps Content-Length is invalid
    Offset: {0}
    Remainder: {1}
"""

    def __init__(self, fileobj, no_record_parse=False, verify_http=False):
        self.fh = fileobj

        self.loader = ArcWarcRecordLoader(verify_http=verify_http)
        self.reader = None

        self.offset = 0
        self.known_format = None

        self.member_info = None
        self.no_record_parse = no_record_parse

    def iter_records(self, block_size=16384):
        """ iterate over each record
        """
        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        raise_invalid_gzip = False
        empty_record = False
        record = None

        while True:
            try:
                curr_offset = self.fh.tell()
                record = self._next_record(self.next_line)
                if raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                yield record

            except EOFError:
                empty_record = True

            if record:
                self.read_to_end(record)

            if self.reader.decompressor:
                # if another gzip member, continue
                if self.reader.read_next_member():
                    continue

                # if empty record, then we're done
                elif empty_record:
                    break

                # otherwise, probably a gzip
                # containing multiple non-chunked records
                # raise this as an error
                else:
                    raise_invalid_gzip = True

            # non-gzip, so we're done
            elif empty_record:
                break

    def _raise_invalid_gzip_err(self):
        """ A gzip file with multiple ARC/WARC records, non-chunked,
        has been detected. This is not valid for replay, so notify user
        """
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

        count empty_size so that it can be subtracted from the record length
        for uncompressed

        if first line read is not blank, likely error in WARC/ARC,
        display a warning
        """
        empty_size = 0
        first_line = True

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None, empty_size

            stripped = line.rstrip()

            if stripped == '' or first_line:
                empty_size += len(line)

                if stripped != '':
                    # if first line is not blank,
                    # likely content-length was invalid, display warning
                    err_offset = (self.fh.tell() -
                                  self.reader.rem_length() - empty_size)
                    sys.stderr.write(self.INC_RECORD.format(err_offset, line))

                first_line = False
                continue

            return line, empty_size

    def read_to_end(self, record, payload_callback=None):
        """ Read remainder of the stream
        If a digester is included, update it with the data read
        """
        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(8192)
            if not b:
                break
            num += len(b)
            if payload_callback:
                payload_callback(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            length -= empty_size

        self.member_info = (curr_offset, length)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        self.known_format = record.format

        return record

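A sketch of what "properly chunked" means in GZIP_ERR_MSG above: each record is compressed as its own complete gzip member, and the members are simply concatenated. The payloads and filename here are placeholders, not real WARC records.

import gzip

records = [b'WARC/1.0\r\n...record 1...\r\n\r\n',
           b'WARC/1.0\r\n...record 2...\r\n\r\n']

with open('chunked.warc.gz', 'wb') as out:  # placeholder path
    for rec in records:
        out.write(gzip.compress(rec))  # one gzip member per record
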
def rewrite_content(self, urlrewriter, headers, stream,
                    head_insert_func=None, urlkey='',
                    cdx=None):

    wb_url = urlrewriter.wburl

    if (wb_url.is_identity or
        (not head_insert_func and wb_url.is_banner_only)):
        status_headers, stream = self.sanitize_content(headers, stream)
        return (status_headers, self.stream_to_gen(stream), False)

    if wb_url.is_banner_only:
        urlrewriter = None

    rule = self.ruleset.get_first_match(urlkey)

    (rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
                                                        rule,
                                                        headers,
                                                        stream)

    status_headers = rewritten_headers.status_headers

    # use rewritten headers, but no further rewriting needed
    if rewritten_headers.text_type is None:
        return (status_headers, self.stream_to_gen(stream), False)

    # Handle text content rewriting
    # ====================================================================
    # special case -- need to ungzip the body

    text_type = rewritten_headers.text_type

    # see known js/css modifier specified, the context should run
    # default text_type
    mod = wb_url.mod

    stream_raw = False
    encoding = None
    first_buff = None

    if rewritten_headers.contains_removed_header('content-encoding', 'gzip'):
        #optimize: if already a ChunkedDataReader, add gzip
        if isinstance(stream, ChunkedDataReader):
            stream.set_decomp('gzip')
        else:
            stream = DecompressingBufferedReader(stream)

    if mod == 'js_':
        text_type, stream = self._resolve_text_type('js',
                                                    text_type,
                                                    stream)
    elif mod == 'cs_':
        text_type, stream = self._resolve_text_type('css',
                                                    text_type,
                                                    stream)

    rewriter_class = rule.rewriters[text_type]

    # for html, need to perform header insert, supply js, css, xml
    # rewriters
    if text_type == 'html':
        head_insert_str = ''

        if head_insert_func:
            head_insert_str = head_insert_func(rule, cdx)
            head_insert_str = head_insert_str.encode('utf-8')

        if wb_url.is_banner_only:
            gen = self._head_insert_only_gen(head_insert_str, stream)

            content_len = headers.get_header('Content-Length')
            try:
                content_len = int(content_len)
            except Exception:
                content_len = None

            if content_len and content_len >= 0:
                content_len = str(content_len + len(head_insert_str))
                status_headers.replace_header('Content-Length',
                                              content_len)

            return (status_headers, gen, False)

        rewriter = rewriter_class(urlrewriter,
                                  js_rewriter_class=rule.rewriters['js'],
                                  css_rewriter_class=rule.rewriters['css'],
                                  head_insert=head_insert_str,
                                  defmod=self.defmod,
                                  parse_comments=rule.parse_comments)

    else:
        if wb_url.is_banner_only:
            return (status_headers, self.stream_to_gen(stream), False)

        # apply one of (js, css, xml) rewriters
        rewriter = rewriter_class(urlrewriter)

    # Create rewriting generator
    gen = self.rewrite_text_stream_to_gen(stream,
                                          rewrite_func=rewriter.rewrite,
                                          final_read_func=rewriter.close,
                                          first_buff=first_buff)

    return (status_headers, gen, True)

def test_brotli():
    with open(get_test_dir() + 'text_content/quickfox_repeated.compressed', 'rb') as fh:
        x = DecompressingBufferedReader(fh, decomp_type='br')
        # assert added: the bare comparison would silently discard its result
        assert x.read() == b'The quick brown fox jumps over the lazy dog' * 4096

class ArchiveIterator(object):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.
    """

    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues
    beyond single record.

    This file is probably not a multi-chunk gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.

    This file is likely still valid and you can use it by decompressing it:

    gunzip myfile.{0}.gz

    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:

    warc2warc -Z myfile.{0} > myfile.{0}.gz
"""

    INC_RECORD = """\
    WARNING: Record not followed by newline, perhaps Content-Length is invalid
    Offset: {0}
    Remainder: {1}
"""

    def __init__(self, fileobj, no_record_parse=False,
                 verify_http=False, arc2warc=False):
        self.fh = fileobj

        self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                          arc2warc=arc2warc)
        self.reader = None

        self.offset = 0
        self.known_format = None

        self.mixed_arc_warc = arc2warc

        self.member_info = None
        self.no_record_parse = no_record_parse

    def __call__(self, block_size=16384):
        """ iterate over each record
        """
        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        raise_invalid_gzip = False
        empty_record = False
        record = None

        while True:
            try:
                curr_offset = self.fh.tell()
                record = self._next_record(self.next_line)
                if raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                yield record

            except EOFError:
                empty_record = True

            if record:
                self.read_to_end(record)

            if self.reader.decompressor:
                # if another gzip member, continue
                if self.reader.read_next_member():
                    continue

                # if empty record, then we're done
                elif empty_record:
                    break

                # otherwise, probably a gzip
                # containing multiple non-chunked records
                # raise this as an error
                else:
                    raise_invalid_gzip = True

            # non-gzip, so we're done
            elif empty_record:
                break

    def _raise_invalid_gzip_err(self):
        """ A gzip file with multiple ARC/WARC records, non-chunked,
        has been detected. This is not valid for replay, so notify user
        """
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

        count empty_size so that it can be subtracted from the record length
        for uncompressed

        if first line read is not blank, likely error in WARC/ARC,
        display a warning
        """
        empty_size = 0
        first_line = True

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None, empty_size

            stripped = line.rstrip()

            if len(stripped) == 0 or first_line:
                empty_size += len(line)

                if len(stripped) != 0:
                    # if first line is not blank,
                    # likely content-length was invalid, display warning
                    err_offset = (self.fh.tell() -
                                  self.reader.rem_length() - empty_size)
                    sys.stderr.write(self.INC_RECORD.format(err_offset, line))

                first_line = False
                continue

            return line, empty_size

    def read_to_end(self, record, payload_callback=None):
        """ Read remainder of the stream
        If a digester is included, update it with the data read
        """
        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(8192)
            if not b:
                break
            num += len(b)
            if payload_callback:
                payload_callback(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            length -= empty_size

        self.member_info = (curr_offset, length)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        if not self.mixed_arc_warc:
            self.known_format = record.format

        return record

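In this variant the instance itself is called to obtain the record generator. A usage sketch (not from the original sources; the filename is a placeholder, and arc2warc is simply passed through to ArcWarcRecordLoader as the constructor above shows):

with open('example.arc.gz', 'rb') as fh:  # placeholder path
    archive_iterator = ArchiveIterator(fh, arc2warc=True)
    for record in archive_iterator():
        print(record.format, record.content_type)
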
class ArchiveIterator(object):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.
    """

    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues
    beyond single record.

    This file is probably not a multi-chunk gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.

    This file is likely still valid and you can use it by decompressing it:

    gunzip myfile.{0}.gz

    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:

    warc2warc -Z myfile.{0} > myfile.{0}.gz
"""

    def __init__(self, fileobj, no_record_parse=False, verify_http=False):
        self.fh = fileobj
        self.loader = ArcWarcRecordLoader(verify_http=verify_http)
        self.reader = None
        self.offset = 0
        self.known_format = None
        self.member_info = None
        self.no_record_parse = no_record_parse

    def iter_records(self, block_size=16384):
        """ iterate over each record
        """
        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()
        self.next_line = None

        is_valid = True

        while True:
            try:
                record = self._next_record(self.next_line)
                if not is_valid:
                    self._raise_err()

                yield record
            except EOFError:
                break

            self.read_to_end(record)

            if self.reader.decompressor:
                is_valid = self.reader.read_next_member()

    def _raise_err(self):
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

        count empty_size so that it can be subtracted from the record length
        for uncompressed
        """
        empty_size = 0

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None, empty_size

            if line.rstrip() == '':
                empty_size += len(line)
                continue

            return line, empty_size

    def read_to_end(self, record, compute_digest=False):
        """ Read remainder of the stream
        If a digester is included, update it with the data read
        """
        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        if compute_digest:
            digester = hashlib.sha1()
        else:
            digester = None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(8192)
            if not b:
                break
            num += len(b)
            if digester:
                digester.update(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            length -= empty_size

        if compute_digest:
            digest = base64.b32encode(digester.digest())
        else:
            digest = None

        self.member_info = (curr_offset, length, digest)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        self.known_format = record.format

        return record

class ArchiveIterator(object):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.
    """

    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues
    beyond single record.

    This file is probably not a multi-chunk gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.

    This file is likely still valid and you can use it by decompressing it:

    gunzip myfile.{0}.gz

    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:

    warc2warc -Z myfile.{0} > myfile.{0}.gz
"""

    def __init__(self, fileobj, no_record_parse=False):
        self.fh = fileobj
        self.loader = ArcWarcRecordLoader()
        self.reader = None
        self.offset = 0
        self.known_format = None
        self.member_info = None
        self.no_record_parse = no_record_parse

    def iter_records(self, block_size=16384):
        """ iterate over each record
        """
        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()
        self.next_line = None

        is_valid = True

        while True:
            try:
                record = self._next_record(self.next_line)
                if not is_valid:
                    self._raise_err()

                yield record
            except EOFError:
                break

            self.read_to_end(record)

            if self.reader.decompressor:
                is_valid = self.reader.read_next_member()

    def _raise_err(self):
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

        count empty_size so that it can be subtracted from the record length
        for uncompressed
        """
        empty_size = 0

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None, empty_size

            if line.rstrip() == '':
                empty_size += len(line)
                continue

            return line, empty_size

    def read_to_end(self, record, compute_digest=False):
        """ Read remainder of the stream
        If a digester is included, update it with the data read
        """
        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        if compute_digest:
            digester = hashlib.sha1()
        else:
            digester = None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(8192)
            if not b:
                break
            num += len(b)
            if digester:
                digester.update(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            length -= empty_size

        if compute_digest:
            digest = base64.b32encode(digester.digest())
        else:
            digest = None

        self.member_info = (curr_offset, length, digest)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        self.known_format = record.format

        return record