def test_compress_mix():
    """Check reading a compressed 'ABC' payload followed by plain b'123'.

    The first ``read()`` must return only b'ABC'; after
    ``read_next_member()`` the following ``read()`` must return b'123'.
    """
    payload = compress('ABC') + b'123'
    reader = DecompressingBufferedReader(BytesIO(payload), decomp_type='gzip')

    first_chunk = reader.read()
    assert first_chunk == b'ABC'

    # Advance past the first member before reading the trailing bytes.
    reader.read_next_member()
    assert reader.read() == b'123'
def test_compress_invalid():
    """Check reads from a compressed buffer whose tail has been overwritten.

    The last two bytes of the compressed data are replaced with b'xyz'.
    The test expects ``read(3)`` to return b'ABC' and the following
    ``read()`` to return b'DE'.
    """
    compressed = compress('ABCDEFG' * 1)

    # cut-off part of the block
    damaged = compressed[:-2] + b'xyz'

    reader = DecompressingBufferedReader(BytesIO(damaged), block_size=16)

    head = reader.read(3)
    assert head == b'ABC'

    assert b'DE' == reader.read()
def get_rendered_original_stream(warc_filename, warc_offset, compressedendoffset, payload_only=True):
    """
    Fetch a WARC record over WebHDFS and return its payload or the raw record.

    The request URL is built from ``WEBHDFS_PREFIX``, ``warc_filename`` and
    ``WEBHDFS_USER`` using the ``OPEN`` operation.

    :param warc_filename: Path of the WARC file, appended to
        ``WEBHDFS_PREFIX``. If ``None``, nothing is fetched.
    :param warc_offset: Value sent as the ``offset`` query parameter.
    :param compressedendoffset: When truthy and ``int()`` of it is positive,
        sent as the ``length`` query parameter; otherwise no length is sent.
    :param payload_only: If true (the default), parse the record and return
        its content stream; otherwise return the decompressed record bytes.
    :return: ``(None, None)`` when ``warc_filename`` is ``None``;
        ``(content_stream, content_type)`` when ``payload_only`` is true;
        otherwise ``(record_bytes, 'application/warc')``.

    NOTE(review): the HTTP status code is only logged, never checked, so an
    error response is handed to the WARC parser as-is -- confirm callers
    expect that.
    """
    # If not found, say so:
    if warc_filename is None:
        return None, None

    # Grab the payload from the WARC and return it.
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset and int(compressedendoffset) > 0:
        url = "%s&length=%s" % (url, compressedendoffset)
    r = requests.get(url, stream=True)
    # We handle decoding etc.
    r.raw.decode_content = False
    # Pass arguments lazily so formatting only happens when DEBUG is enabled.
    logger.debug("Loading from: %s", r.url)
    logger.debug("Got status code %s", r.status_code)

    reader = DecompressingBufferedReader(stream=r.raw)

    # Return the payload, or the record:
    if payload_only:
        # Parse the WARC, return the payload. The response is left open on
        # success because the returned stream is handed to the caller
        # unread; it is released only if parsing fails.
        try:
            record = ArcWarcRecordLoader().parse_record_stream(reader)
        except Exception:
            r.close()
            raise
        return record.content_stream(), record.content_type

    # This makes sure we only get the first GZip chunk. The record is read
    # fully into memory here, so the connection can be released immediately.
    try:
        warc_record = reader.read()
    finally:
        r.close()
    return warc_record, 'application/warc'
def test_generate_record(self, record_sampler, is_gzip, builder_factory):
    """Write a sampled record, then check it survives reading and re-parsing.

    ``record_sampler`` supplies a ``(record_maker, record_string)`` pair:
    the maker builds a record from a builder, and the string is the expected
    serialized text. The record is written with ``gzip=is_gzip`` and then
    checked three ways: raw decoded output, re-serialization of the parsed
    record through a non-gzip writer, and individual parsed fields.
    """
    warc_writer = FixedTestWARCWriter(gzip=is_gzip)
    record_builder = builder_factory(warc_writer)
    make_record, expected_text = record_sampler
    original = make_record(record_builder)

    warc_writer.write_record(original)

    written_bytes = warc_writer.get_contents()
    self._validate_record_content_len(BytesIO(written_bytes))

    decoded = DecompressingBufferedReader(warc_writer.get_stream()).read()

    # Gzip output must expand when decoded; plain output keeps its size.
    if not is_gzip:
        assert len(decoded) == len(written_bytes)
    else:
        assert len(decoded) > len(written_bytes)

    assert decoded.decode('utf-8') == expected_text

    # assert parsing record matches as well
    loader_input = DecompressingBufferedReader(warc_writer.get_stream())
    reparsed = ArcWarcRecordLoader().parse_record_stream(loader_input)

    plain_writer = FixedTestWARCWriter(gzip=False)
    plain_writer.write_record(reparsed)
    assert plain_writer.get_contents().decode('utf-8') == expected_text

    # verify parts of record
    loader_input = DecompressingBufferedReader(warc_writer.get_stream())
    reparsed = ArcWarcRecordLoader().parse_record_stream(loader_input)

    content_text = reparsed.content_stream().read().decode('utf-8')
    assert content_text in expected_text

    record_type = reparsed.rec_type

    # verify http_headers: they must match the original record
    assert original.http_headers == reparsed.http_headers

    if reparsed.http_headers:
        assert record_type in ('response', 'request', 'revisit')
    elif record_type == 'revisit':
        # empty revisit
        assert len(content_text) == 0
    else:
        assert len(content_text) == reparsed.length
def test_generate_response_gzip(self):
    """Write a sample response with gzip enabled and check the decoded text.

    The decoded output must be longer than the written gzip bytes and must
    equal ``RESPONSE_RECORD`` when decoded as UTF-8.
    """
    warc_writer = FixedTestWARCWriter(gzip=True)
    warc_writer.write_record(self._sample_response(warc_writer))

    compressed = warc_writer.get_contents()
    self._validate_record_content_len(BytesIO(compressed))

    decoded = DecompressingBufferedReader(warc_writer.get_stream()).read()

    assert len(decoded) > len(compressed)
    assert decoded.decode('utf-8') == RESPONSE_RECORD