def test_generate_record(self, record_sampler, is_gzip, builder_factory):
    """Round-trip a sample record through the writer and the record loader.

    Args:
        record_sampler: pair ``(record_maker, record_string)``. ``record_maker``
            is called with the builder to create the record under test;
            ``record_string`` is the text the uncompressed serialization is
            compared against.
        is_gzip: passed as ``gzip=`` to the writer under test.
        builder_factory: callable that receives the writer and returns the
            builder handed to ``record_maker``.
    """
    writer = FixedTestWARCWriter(gzip=is_gzip)
    builder = builder_factory(writer)

    record_maker, record_string = record_sampler
    record = record_maker(builder)
    writer.write_record(record)

    raw_buff = writer.get_contents()
    self._validate_record_content_len(BytesIO(raw_buff))

    buff = DecompressingBufferedReader(writer.get_stream()).read()

    # With gzip enabled the raw writer output is expected to be shorter than
    # what the decompressing reader returns; otherwise both are the same size.
    if is_gzip:
        assert len(buff) > len(raw_buff)
    else:
        assert len(buff) == len(raw_buff)

    assert buff.decode('utf-8') == record_string

    # Parsing the record and re-writing it uncompressed must match as well.
    stream = DecompressingBufferedReader(writer.get_stream())
    parsed_record = ArcWarcRecordLoader().parse_record_stream(stream)

    writer2 = FixedTestWARCWriter(gzip=False)
    writer2.write_record(parsed_record)
    assert writer2.get_contents().decode('utf-8') == record_string

    # Verify individual parts of a freshly parsed record.
    stream = DecompressingBufferedReader(writer.get_stream())
    parsed_record = ArcWarcRecordLoader().parse_record_stream(stream)

    # Keep the undecoded payload: size checks below must count bytes, while
    # the containment check needs text.
    content_bytes = parsed_record.content_stream().read()
    content_buff = content_bytes.decode('utf-8')
    assert content_buff in record_string

    rec_type = parsed_record.rec_type

    # The parsed HTTP headers must match the ones on the original record.
    assert record.http_headers == parsed_record.http_headers

    if parsed_record.http_headers:
        assert rec_type in ('response', 'request', 'revisit')
    elif rec_type == 'revisit':
        # Empty revisit: no payload at all.
        assert len(content_bytes) == 0
    else:
        # NOTE(review): parsed_record.length is assumed to be the record's
        # Content-Length in bytes -- confirm. Comparing the raw byte count
        # (not the decoded character count) keeps this check valid for
        # non-ASCII payloads.
        assert len(content_bytes) == parsed_record.length
def test_warcinfo_record(self, is_gzip):
    """Write a sample warcinfo record and check it after parsing it back.

    The record is created by passing the writer itself to ``sample_warcinfo``.

    Args:
        is_gzip: passed as ``gzip=`` to the writer under test.
    """
    # NOTE(review): another ``test_warcinfo_record`` appears later in this
    # file. If both live in the same class, the later definition replaces
    # this one and this test never runs -- confirm.
    writer = FixedTestWARCWriter(gzip=is_gzip)
    writer.write_record(sample_warcinfo(writer))

    loader = ArcWarcRecordLoader()
    parsed_record = loader.parse_record_stream(
        DecompressingBufferedReader(writer.get_stream()))

    # WARC header name -> value expected on the parsed record.
    expected_headers = (
        ('WARC-Type', 'warcinfo'),
        ('Content-Type', 'application/warc-fields'),
        ('WARC-Filename', 'testfile.warc.gz'),
    )
    for header_name, header_value in expected_headers:
        assert parsed_record.rec_headers.get_header(header_name) == header_value

    payload = parsed_record.content_stream().read().decode('utf-8')

    # Both warcinfo fields must be present in the record body.
    expected_fields = (
        'json-metadata: {"foo": "bar"}\r\n',
        'format: WARC File Format 1.0\r\n',
    )
    for field_line in expected_fields:
        assert field_line in payload
def test_warcinfo_record(self, is_gzip, builder_factory):
    """Write a builder-created warcinfo record and check it after parsing.

    Args:
        is_gzip: passed as ``gzip=`` to the writer under test.
        builder_factory: callable that receives the writer and returns the
            builder handed to ``sample_warcinfo``.
    """
    writer = FixedTestWARCWriter(gzip=is_gzip)
    record_builder = builder_factory(writer)
    writer.write_record(sample_warcinfo(record_builder))

    loader = ArcWarcRecordLoader()
    parsed_record = loader.parse_record_stream(
        DecompressingBufferedReader(writer.get_stream()))

    # WARC header name -> value expected on the parsed record, including the
    # block digest of the sample warcinfo payload.
    expected_headers = (
        ('WARC-Type', 'warcinfo'),
        ('Content-Type', 'application/warc-fields'),
        ('WARC-Filename', 'testfile.warc.gz'),
        ('WARC-Block-Digest', 'sha1:GAD6P5BTZPRU57ICXEYUJZGCURZYABID'),
    )
    for header_name, header_value in expected_headers:
        assert parsed_record.rec_headers.get_header(header_name) == header_value

    payload = parsed_record.content_stream().read().decode('utf-8')

    # Both warcinfo fields must be present in the record body.
    expected_fields = (
        'json-metadata: {"foo": "bar"}\r\n',
        'format: WARC File Format 1.0\r\n',
    )
    for field_line in expected_fields:
        assert field_line in payload