def __init__(self, fileobj, no_record_parse=False, verify_http=False,
             arc2warc=False, ensure_http_headers=False, block_size=BUFF_SIZE):
    """Initialize iteration state over an ARC/WARC stream.

    :param fileobj: file-like object positioned at start of the archive
    :param no_record_parse: if True, skip parsing http headers per record
    :param verify_http: passed through to the record loader
    :param arc2warc: convert ARC records to WARC on the fly
    :param ensure_http_headers: force http headers onto parsed records
    :param block_size: buffer size for the decompressing reader
    """
    self.fh = fileobj

    # the loader performs per-record parsing (and optional arc->warc conversion)
    self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                      arc2warc=arc2warc)

    self.known_format = None
    self.mixed_arc_warc = arc2warc

    self.member_info = None
    self.no_record_parse = no_record_parse
    self.ensure_http_headers = ensure_http_headers

    # wrap the raw file in a decompressing reader; handles gzip members transparently
    self.reader = DecompressingBufferedReader(self.fh, block_size=block_size)
    self.offset = self.fh.tell()

    self.next_line = None
    self.err_count = 0
    self.record = None

    self.the_iter = self._iterate_records()
def get_rendered_original_stream(warc_filename, warc_offset, compressedendoffset, payload_only=True):
    """
    Grabs a resource.
    """
    # If not found, say so:
    if warc_filename is None:
        return None, None

    # Grab the payload from the WARC and return it.
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset and int(compressedendoffset) > 0:
        url = "%s&length=%s" % (url, compressedendoffset)

    r = requests.get(url, stream=True)
    # We handle decoding etc.
    r.raw.decode_content = False
    logger.debug("Loading from: %s" % r.url)
    logger.debug("Got status code %s" % r.status_code)

    # Return the payload, or the record:
    if not payload_only:
        # This makes sure we only get the first GZip chunk:
        reader = DecompressingBufferedReader(stream=r.raw)
        return reader.read(), 'application/warc'

    # Parse the WARC, return the payload:
    record = ArcWarcRecordLoader().parse_record_stream(
        DecompressingBufferedReader(stream=r.raw))
    #return record.raw_stream, record.content_type
    return record.content_stream(), record.content_type
def test_s3_read_2():
    """Read a whole object from S3 and verify its length and first line."""
    pytest.importorskip('boto3')

    data = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/index.html').read()
    assert len(data) == 2082

    stream = DecompressingBufferedReader(BytesIO(data))
    assert stream.readline() == b'<!DOCTYPE html>\n'
def test_compress_invalid():
    """A truncated/corrupted gzip tail should still yield the bytes decoded so far."""
    data = compress('ABCDEFG' * 1)
    # cut-off part of the block
    data = data[:-2] + b'xyz'

    reader = DecompressingBufferedReader(BytesIO(data), block_size=16)
    assert reader.read(3) == b'ABC'
    assert reader.read() == b'DE'
def test_s3_read_1():
    """Ranged read of a single gzipped WARC record from S3; verify the record header."""
    pytest.importorskip('boto3')

    res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
                             offset=53235662,
                             length=2526)

    data = res.read()
    assert len(data) == 2526

    stream = DecompressingBufferedReader(BytesIO(data))
    assert stream.readline() == b'WARC/1.0\r\n'
    assert stream.readline() == b'WARC-Type: response\r\n'
def test_record_video_metadata(self):
    """Record a youtube-dl metadata capture and verify the stored metadata record."""
    pytest.importorskip('youtube_dl')

    dedup_index = self._get_dedup_index()
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
    recorder_app = RecorderApp(self.upstream_url,
                               PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

    params = {'param.recorder.user': '******',
              'param.recorder.coll': 'VIDEO',
              'content_type': 'application/vnd.youtube-dl_formats+json'
             }

    resp = self._test_warc_write(recorder_app, 'www.youtube.com',
                                 '/v/BfBgWtAIbRc', '&' + urlencode(params),
                                 link_url='metadata://www.youtube.com/v/BfBgWtAIbRc')

    r = FakeStrictRedis.from_url('redis://localhost/2')
    warcs = r.hgetall('USER:VIDEO:warc')
    assert len(warcs) == 1

    filename = list(warcs.values())[0]

    with open(filename, 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)
        record = ArcWarcRecordLoader().parse_record_stream(decomp)

        status_headers = record.rec_headers
        assert status_headers.get_header('WARC-Type') == 'metadata'
        assert status_headers.get_header('Content-Type') == 'application/vnd.youtube-dl_formats+json'
        assert status_headers.get_header('WARC-Block-Digest') != ''
        assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
def fetch_warc_record(capture, warc_download_prefix):
    """Fetch one WARC record via an HTTP Range request and annotate its source.

    :param capture: dict with 'url', 'filename', 'offset', 'length'
    :param warc_download_prefix: base URL the filename is appended to
    :return: the parsed record, with WARC-Source-URI/Range headers set
    :raises ValueError: if a required capture field is missing
    """
    for field in ('url', 'filename', 'offset', 'length'):
        if field not in capture:  # pragma: no cover
            raise ValueError('capture must contain '+field)

    url = capture['url']
    offset = int(capture['offset'])
    length = int(capture['length'])

    warc_url = warc_download_prefix + '/' + capture['filename']

    # the same byte range is used for the request and the Source-Range header
    byte_range = 'bytes={}-{}'.format(offset, offset+length-1)

    resp = myrequests_get(warc_url, headers={'Range': byte_range})

    record = ArcWarcRecordLoader().parse_record_stream(
        DecompressingBufferedReader(BytesIO(resp.content)))

    for header in ('WARC-Source-URI', 'WARC-Source-Range'):
        if record.rec_headers.get_header(header):  # pragma: no cover
            print('Surprised that {} was already set in this WARC record'.format(header), file=sys.stderr)

    warc_target_uri = record.rec_headers.get_header('WARC-Target-URI')
    if url != warc_target_uri:  # pragma: no cover
        print('Surprised that WARC-Target-URI {} is not the capture url {}'.format(warc_target_uri, url), file=sys.stderr)

    record.rec_headers.replace_header('WARC-Source-URI', warc_url)
    record.rec_headers.replace_header('WARC-Source-Range', byte_range)

    return record
def test_brotli():
    """Decompress a known brotli buffer and verify the expanded payload.

    Bug fix: the original wrote ``x.read() == b'...'`` as a bare expression,
    so the comparison result was discarded and the test could never fail.
    The comparison is now asserted.
    """
    brotli_buff = b'[\xff\xaf\x02\xc0"y\\\xfbZ\x8cB;\xf4%U\x19Z\x92\x99\xb15\xc8\x19\x9e\x9e\n{K\x90\xb9<\x98\xc8\t@\xf3\xe6\xd9M\xe4me\x1b\'\x87\x13_\xa6\xe90\x96{<\x15\xd8S\x1c'

    with closing(DecompressingBufferedReader(BytesIO(brotli_buff), decomp_type='br')) as x:
        assert x.read() == b'The quick brown fox jumps over the lazy dog' * 4096
def test_record_custom_record(self):
    """PUT a custom 'resource' record and verify the WARC record that gets written.

    Bug fix: the request URL contained the corrupted character '¶' (U+00B6,
    an HTML-entity mangling of '&para') where the literal '&param' belongs,
    so the 'param.recorder.coll' query parameter was never sent.
    """
    dedup_index = self._get_dedup_index(user=False)

    warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')

    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'

    buff = b'Some Data'

    testapp = webtest.TestApp(recorder_app)
    headers = {'content-type': 'text/plain',
               'WARC-Custom': 'foo'}

    resp = testapp.put(req_url, headers=headers, params=buff)

    assert resp.json['success'] == 'true'
    assert resp.json['WARC-Date'] != ''

    self._test_all_warcs('/warcs/meta', 1)

    r = FakeStrictRedis.from_url('redis://localhost/2')

    warcs = r.hgetall('META:warc')
    assert len(warcs) == 1

    warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')

    with open(warcs[warc_key], 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)
        record = ArcWarcRecordLoader().parse_record_stream(
            decomp, ensure_http_headers=True)

        status_headers = record.rec_headers
        assert len(record.rec_headers.headers) == 9
        assert status_headers.get_header('WARC-Type') == 'resource'
        assert status_headers.get_header(
            'WARC-Target-URI') == 'custom://httpbin.org'
        assert status_headers.get_header('WARC-Record-ID') != ''
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Block-Digest') != ''
        assert status_headers.get_header(
            'WARC-Block-Digest') == status_headers.get_header(
                'WARC-Payload-Digest')
        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))
        assert status_headers.get_header('WARC-Custom') == 'foo'

        assert record.raw_stream.read() == buff

        status_headers = record.http_headers
        assert len(record.http_headers.headers) == 2

        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))

    writer.close()
    assert len(writer.fh_cache) == 0
def test_generate_response_gzip(self):
    """A gzipped response record must decompress back to the canonical text."""
    writer = FixedTestWARCWriter(gzip=True)

    writer.write_record(self._sample_response(writer))

    gzip_buff = writer.get_contents()
    self._validate_record_content_len(BytesIO(gzip_buff))

    reader = DecompressingBufferedReader(writer.get_stream())
    plain = reader.read()

    # decompressed output is larger than the gzipped bytes
    assert len(plain) > len(gzip_buff)
    assert plain.decode('utf-8') == RESPONSE_RECORD
def test_record_param_user_coll_revisit(self):
    """Record a duplicate capture and verify a revisit record + CDX entry are written.

    Bug fix: the query-string argument contained the corrupted character '¶'
    (U+00B6, an HTML-entity mangling of '&para') in both places where the
    literal '&param' belongs, so the recorder user/coll parameters were never sent.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index()

    recorder_app = RecorderApp(
        self.upstream_url,
        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

    self._test_all_warcs('/warcs/USER/COLL/', 1)

    resp = self._test_warc_write(
        recorder_app, 'httpbin.org', '/user-agent',
        '&param.recorder.user=USER&param.recorder.coll=COLL')

    assert '"user-agent": "{0}"'.format(UA) in resp.text
    #assert b'HTTP/1.1 200 OK' in resp.body
    #assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # Test Redis CDX
    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    if b'warc/revisit' in res[0]:
        cdx = CDXObject(res[0])
    else:
        cdx = CDXObject(res[1])

    assert cdx['urlkey'] == 'org,httpbin)/user-agent'
    assert cdx['mime'] == 'warc/revisit'
    assert cdx['offset'] == '0'
    assert cdx['filename'].startswith(to_path('USER/COLL/'))
    assert cdx['filename'].endswith('.warc.gz')

    fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

    warcs = r.hgetall('USER:COLL:warc')
    assert len(warcs) == 2
    assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode(
        'utf-8')

    with open(fullwarc, 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)

        # Test refers-to headers
        status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
        assert status_headers.get_header('WARC-Type') == 'revisit'
        assert status_headers.get_header(
            'WARC-Target-URI') == 'http://httpbin.org/user-agent'
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header(
            'WARC-Refers-To-Target-URI') == 'http://httpbin.org/user-agent'
        assert status_headers.get_header('WARC-Refers-To-Date') != ''
def test_read_from_stream_no_content_length(self, record_sampler, is_gzip, builder_factory):
    """A record missing Content-Length parses correctly and is rewritten with it."""
    writer = FixedTestWARCWriter(gzip=is_gzip)
    builder = builder_factory(writer)

    record_maker, record_string = record_sampler
    full_record = record_maker(builder)

    buf = BytesIO()

    record_no_cl = self._conv_to_streaming_record(record_string, full_record.rec_type)

    payload = record_no_cl.encode('utf-8')
    if is_gzip:
        gzip_stream = GzippingWrapper(buf)
        gzip_stream.write(payload)
        gzip_stream.flush()
    else:
        buf.write(payload)

    # parse to verify http headers + payload matches sample record
    # but not rec headers (missing content-length)
    buf.seek(0)
    parsed_record = ArcWarcRecordLoader().parse_record_stream(
        DecompressingBufferedReader(buf))

    if 'Content-Disposition' not in record_string:
        assert full_record.http_headers == parsed_record.http_headers
    assert full_record.raw_stream.read() == parsed_record.raw_stream.read()
    assert full_record.rec_headers != parsed_record.rec_headers

    # parse and write
    buf.seek(0)
    parsed_record = ArcWarcRecordLoader().parse_record_stream(
        DecompressingBufferedReader(buf))

    writer.write_record(parsed_record)

    reader = DecompressingBufferedReader(writer.get_stream())
    written = reader.read()

    # assert written record matches expected response record
    # with content-length, digests computed
    assert written.decode('utf-8') == record_string
def test_brotli_very_small_chunk():
    """Decompress brotli with a tiny block size, forcing many partial feeds."""
    brotli_buff = b'[\xff\xaf\x02\xc0"y\\\xfbZ\x8cB;\xf4%U\x19Z\x92\x99\xb15\xc8\x19\x9e\x9e\n{K\x90\xb9<\x98\xc8\t@\xf3\xe6\xd9M\xe4me\x1b\'\x87\x13_\xa6\xe90\x96{<\x15\xd8S\x1c'

    # read 3 bytes at time, will need to read() multiple types before decompressor has enough to return something
    reader = DecompressingBufferedReader(BytesIO(brotli_buff),
                                         decomp_type='br',
                                         block_size=3)
    with closing(reader) as x:
        assert x.read() == b'The quick brown fox jumps over the lazy dog' * 4096
def lookupRecord(url):
    """ Look up URL in database. """
    try:
        filename, offset, length = urlmap[url]
        # copy the record bytes out of the file so the stream can close
        with open(filename, 'rb') as stream:
            stream.seek(offset, 0)
            buf = BytesIO(stream.read(length))
        return ArcWarcRecordLoader().parse_record_stream(
            DecompressingBufferedReader(buf))
    except KeyError:
        # URL not in the index
        return None
def decompress_and_recompress(self, stream, output):
    """Decompress the input stream fully to a temp file, then recompress into output."""
    with tempfile.TemporaryFile() as tmp:
        reader = DecompressingBufferedReader(stream)

        # decompress entire file to temp file
        stream.seek(0)
        shutil.copyfileobj(reader, tmp)

        # attempt to compress and write temp
        tmp.seek(0)
        self.load_and_write(tmp, output)
def test_compress_mix():
    """A gzip member followed by plain bytes: second member read after explicit advance."""
    reader = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'),
                                         decomp_type='gzip')
    assert reader.read() == b'ABC'

    # move past the gzip member to reach the uncompressed trailer
    reader.read_next_member()
    assert reader.read() == b'123'
def test_generate_record(self, record_sampler, is_gzip, builder_factory):
    """Round-trip a sample record through the writer and verify all representations."""
    writer = FixedTestWARCWriter(gzip=is_gzip)
    builder = builder_factory(writer)

    record_maker, record_string = record_sampler
    record = record_maker(builder)

    writer.write_record(record)

    raw_buff = writer.get_contents()
    self._validate_record_content_len(BytesIO(raw_buff))

    reader = DecompressingBufferedReader(writer.get_stream())
    decompressed = reader.read()

    if is_gzip:
        assert len(decompressed) > len(raw_buff)
    else:
        assert len(decompressed) == len(raw_buff)

    assert decompressed.decode('utf-8') == record_string

    # assert parsing record matches as well
    reader = DecompressingBufferedReader(writer.get_stream())
    parsed_record = ArcWarcRecordLoader().parse_record_stream(reader)

    writer2 = FixedTestWARCWriter(gzip=False)
    writer2.write_record(parsed_record)
    assert writer2.get_contents().decode('utf-8') == record_string

    # verify parts of record
    reader = DecompressingBufferedReader(writer.get_stream())
    parsed_record = ArcWarcRecordLoader().parse_record_stream(reader)

    content_buff = parsed_record.content_stream().read().decode('utf-8')
    assert content_buff in record_string

    rec_type = parsed_record.rec_type

    # verify http_headers
    # match original
    assert record.http_headers == parsed_record.http_headers

    if parsed_record.http_headers:
        assert rec_type in ('response', 'request', 'revisit')
    else:
        # empty revisit
        if rec_type == 'revisit':
            assert len(content_buff) == 0
        else:
            assert len(content_buff) == parsed_record.length
def get_rendered_original_stream(warc_filename, warc_offset, compressedendoffset):
    """Open a WARC record over WebHDFS; return its raw stream and content type."""
    # If not found, say so:
    if warc_filename is None:
        return None, None

    # Grab the payload from the WARC and return it.
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset:
        url = "%s&length=%s" % (url, compressedendoffset)

    r = requests.get(url, stream=True)
    logger.debug("Loading from: %s" % r.url)
    # leave the raw (possibly gzipped) bytes untouched for the WARC parser
    r.raw.decode_content = False

    record = ArcWarcRecordLoader().parse_record_stream(
        DecompressingBufferedReader(stream=r.raw))

    return record.raw_stream, record.content_type
def load(self, url, offset, length, no_record_parse=False):
    """ Load a single record from given url at offset with length
    and parse as either warc or arc record

    :param url: archive location to load from
    :param offset: byte offset of the record within the archive
    :param length: record length; non-numeric values fall back to -1 (read to end)
    :param no_record_parse: if True, skip parsing record http headers
    :return: the parsed ARC/WARC record
    """
    try:
        length = int(length)
    except (ValueError, TypeError):
        # Bug fix: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; only conversion errors
        # should trigger the -1 (read-to-end) fallback.
        length = -1

    stream = self.loader.load(url, int(offset), length)
    decomp_type = 'gzip'

    # Create decompressing stream
    stream = DecompressingBufferedReader(stream=stream,
                                         decomp_type=decomp_type,
                                         block_size=self.block_size)

    return self.parse_record_stream(stream, no_record_parse=no_record_parse)
def test_warcinfo_record(self, is_gzip):
    """Write a warcinfo record and verify its headers and decoded content."""
    writer = FixedTestWARCWriter(gzip=is_gzip)

    writer.write_record(sample_warcinfo(writer))

    reader = DecompressingBufferedReader(writer.get_stream())
    parsed_record = ArcWarcRecordLoader().parse_record_stream(reader)

    headers = parsed_record.rec_headers
    assert headers.get_header('WARC-Type') == 'warcinfo'
    assert headers.get_header('Content-Type') == 'application/warc-fields'
    assert headers.get_header('WARC-Filename') == 'testfile.warc.gz'

    buff = parsed_record.content_stream().read().decode('utf-8')

    assert 'json-metadata: {"foo": "bar"}\r\n' in buff
    assert 'format: WARC File Format 1.0\r\n' in buff
def test_warcinfo_record(self, is_gzip, builder_factory):
    """Builder variant: warcinfo record carries the expected headers and block digest."""
    writer = FixedTestWARCWriter(gzip=is_gzip)
    builder = builder_factory(writer)

    writer.write_record(sample_warcinfo(builder))

    reader = DecompressingBufferedReader(writer.get_stream())
    parsed_record = ArcWarcRecordLoader().parse_record_stream(reader)

    headers = parsed_record.rec_headers
    assert headers.get_header('WARC-Type') == 'warcinfo'
    assert headers.get_header('Content-Type') == 'application/warc-fields'
    assert headers.get_header('WARC-Filename') == 'testfile.warc.gz'
    assert headers.get_header('WARC-Block-Digest') == 'sha1:GAD6P5BTZPRU57ICXEYUJZGCURZYABID'

    buff = parsed_record.content_stream().read().decode('utf-8')

    assert 'json-metadata: {"foo": "bar"}\r\n' in buff
    assert 'format: WARC File Format 1.0\r\n' in buff
class ArchiveIterator(six.Iterator):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.

    """

    # Error text shown when a gzip file is a single stream covering
    # multiple records (not seekable per-record).
    GZIP_ERR_MSG = """
    ERROR: non-chunked gzip file detected, gzip block continues beyond single record.

    This file is probably not a multi-member gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip member and concatenated together.

    This file is likely still valid and can be fixed by running:

    warcio recompress <path/to/file> <path/to/new_file>
"""

    # Warning printed when a record is not followed by a blank line,
    # usually a symptom of an invalid Content-Length header.
    INC_RECORD = """\
WARNING: Record not followed by newline, perhaps Content-Length is invalid
Offset: {0}
Remainder: {1}
"""

    def __init__(self, fileobj, no_record_parse=False,
                 verify_http=False, arc2warc=False,
                 ensure_http_headers=False, block_size=BUFF_SIZE):
        # raw underlying file; offsets are computed against this handle
        self.fh = fileobj

        self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                          arc2warc=arc2warc)
        self.known_format = None

        # when converting arc->warc, formats may be mixed, so don't cache
        self.mixed_arc_warc = arc2warc

        # (offset, length) of the most recently consumed record, or None
        self.member_info = None
        self.no_record_parse = no_record_parse
        self.ensure_http_headers = ensure_http_headers

        self.reader = DecompressingBufferedReader(self.fh, block_size=block_size)
        self.offset = self.fh.tell()
        self.next_line = None

        self.err_count = 0
        self.record = None

        self.the_iter = self._iterate_records()

    def __iter__(self):
        return self.the_iter

    def __next__(self):
        return six.next(self.the_iter)

    def _iterate_records(self):
        """ iterate over each record
        """
        raise_invalid_gzip = False
        empty_record = False
        record = None

        while True:
            try:
                record = self._next_record(self.next_line)
                if raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                yield record

            except EOFError:
                empty_record = True

            if record:
                self.read_to_end(record)

            if self.reader.decompressor:
                # if another gzip member, continue
                if self.reader.read_next_member():
                    continue

                # if empty record, then we're done
                elif empty_record:
                    break

                # otherwise, probably a gzip
                # containing multiple non-chunked records
                # raise this as an error
                else:
                    raise_invalid_gzip = True

            # non-gzip, so we're done
            elif empty_record:
                break

    def _raise_invalid_gzip_err(self):
        """ A gzip file with multiple ARC/WARC records, non-chunked
        has been detected. This is not valid for replay, so notify user
        """
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise ArchiveLoadFailed(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

        count empty_size so that it can be substracted from
        the record length for uncompressed

        if first line read is not blank, likely error in WARC/ARC,
        display a warning
        """
        empty_size = 0
        first_line = True

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                # end of stream reached
                return None, empty_size

            stripped = line.rstrip()

            if len(stripped) == 0 or first_line:
                empty_size += len(line)

                if len(stripped) != 0:
                    # if first line is not blank,
                    # likely content-length was invalid, display warning
                    err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
                    sys.stderr.write(self.INC_RECORD.format(err_offset, line))
                    self.err_count += 1

                first_line = False
                continue

            # first non-blank line after the separators: start of next record
            return line, empty_size

    def read_to_end(self, record):
        """ Read remainder of the stream
        If a digester is included, update it
        with the data read
        """

        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        curr_offset = self.offset

        # drain any unread portion of the record payload
        while True:
            b = record.raw_stream.read(BUFF_SIZE)
            if not b:
                break

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        # offset into the underlying file, after the reader's buffered bytes
        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            # the peeked next line belongs to the NEXT record
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            # uncompressed: separators are not part of the record length
            length -= empty_size

        self.member_info = (curr_offset, length)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse,
                                                 self.ensure_http_headers)

        self.member_info = None

        # Track known format for faster parsing of other records
        if not self.mixed_arc_warc:
            self.known_format = record.format

        return record