def make_detailed_warcinfo(filename, guid, coll_title, coll_desc, rec_title, pages):
    """Build a buffer containing two warcinfo records for a download.

    The first record carries collection-level metadata (title and
    description); the second carries recording-level metadata (title and
    pages). Both records share the same ``operator``, ``Perma-GUID`` and
    ``format`` fields and differ only in their ``json-metadata`` value.

    The writer is created with ``gzip=True``. The return value is whatever
    ``BufferWARCWriter.get_contents()`` yields once both records are written.

    Thank you! Rhizome/Webrecorder.io/Ilya Kreymer
    """
    collection_info = {'type': 'collection', 'title': coll_title, 'desc': coll_desc}
    recording_info = {'type': 'recording', 'title': rec_title, 'pages': pages}

    buffer_writer = BufferWARCWriter(gzip=True)

    # Fields common to both records; 'json-metadata' is appended on the
    # first pass below and overwritten in place on the second, so the
    # field order stays the same for both records.
    fields = OrderedDict()
    fields['operator'] = 'Perma.cc download'
    fields['Perma-GUID'] = guid
    fields['format'] = 'WARC File Format 1.0'

    # Collection info first, then recording info.
    for info in (collection_info, recording_info):
        fields['json-metadata'] = json.dumps(info)
        warcinfo = buffer_writer.create_warcinfo_record(filename, fields)
        buffer_writer.write_record(warcinfo)

    return buffer_writer.get_contents()
def create_warcinfo(self, creator, title, metadata, source, filename):
    """Serialize a single warcinfo record describing ``metadata``.

    Every item of ``source`` whose name appears in ``self.COPY_FIELDS`` is
    copied into ``metadata`` (the dict is updated in place) before it is
    JSON-encoded into the record's ``json-metadata`` field.

    :param creator: value stored in the ``creator`` field
    :param title: value stored in the ``isPartOf`` field
    :param metadata: dict that is augmented and then JSON-serialized
    :param source: mapping from which the ``COPY_FIELDS`` entries are taken
    :param filename: filename passed to ``create_warcinfo_record``
    :return: the contents of the buffer writer after the record is written
    """
    for field, field_value in iteritems(source):
        if field not in self.COPY_FIELDS:
            continue
        metadata[field] = field_value

    warcinfo_fields = OrderedDict()
    warcinfo_fields['software'] = 'Webrecorder Platform v' + __version__
    warcinfo_fields['format'] = 'WARC File Format 1.0'
    warcinfo_fields['creator'] = creator
    warcinfo_fields['isPartOf'] = title
    warcinfo_fields['json-metadata'] = json.dumps(metadata)

    buffer_writer = BufferWARCWriter()
    warcinfo_record = buffer_writer.create_warcinfo_record(filename, warcinfo_fields)
    buffer_writer.write_record(warcinfo_record)
    return buffer_writer.get_contents()
def test_response_warc_1_1(self, is_gzip, builder_factory):
    """Write a sample response as WARC/1.1 and check what is read back.

    Exactly one record must be read, it must report the WARC/1.1 protocol,
    and its WARC-Date must contain a fractional part and be 27 characters
    long.
    """
    version = 'WARC/1.1'
    warc_writer = BufferWARCWriter(gzip=is_gzip, warc_version=version)
    record_builder = builder_factory(warc_writer, warc_version=version)

    warc_writer.write_record(sample_response(record_builder))

    parsed = list(ArchiveIterator(warc_writer.get_stream()))
    assert len(parsed) == 1

    headers = parsed[0].rec_headers
    assert headers.protocol == 'WARC/1.1'

    # ISO 8601 date with fractional seconds (microseconds)
    warc_date = headers['WARC-Date']
    assert '.' in warc_date
    assert len(warc_date) == 27
def create_warcinfo(self, creator, name, metadata, source, serialized, filename):
    """Serialize a single warcinfo record describing ``metadata``.

    Items of ``serialized`` whose key is listed in ``self.COPY_FIELDS`` are
    copied into ``metadata`` (updated in place). If the result has no
    truthy ``title``, one is generated from ``self.DEFAULT_REC_TITLE`` and
    the ``created_at`` value, and ``auto_title`` is set to ``True``.

    :param creator: object whose ``name`` is stored in the ``creator`` field
    :param name: value stored in the ``isPartOf`` field
    :param metadata: dict that is augmented and then JSON-serialized
    :param source: object providing ``to_iso_date`` for the default title
    :param serialized: mapping from which the ``COPY_FIELDS`` entries are taken
    :param filename: filename passed to ``create_warcinfo_record``
    :return: the contents of the buffer writer after the record is written
    """
    for field, field_value in iteritems(serialized):
        if field not in self.COPY_FIELDS:
            continue
        metadata[field] = field_value

    if not metadata.get('title'):
        # No usable title: derive one from the creation date and flag it.
        template = self.DEFAULT_REC_TITLE
        created_on = source.to_iso_date(metadata['created_at'], no_T=True)
        metadata['title'] = template.format(created_on)
        metadata['auto_title'] = True

    warcinfo_fields = OrderedDict()
    warcinfo_fields['software'] = 'Webrecorder Platform v' + __version__
    warcinfo_fields['format'] = 'WARC File Format 1.0'
    warcinfo_fields['creator'] = creator.name
    warcinfo_fields['isPartOf'] = name
    warcinfo_fields['json-metadata'] = json.dumps(metadata)

    buffer_writer = BufferWARCWriter()
    warcinfo_record = buffer_writer.create_warcinfo_record(filename, warcinfo_fields)
    buffer_writer.write_record(warcinfo_record)
    return buffer_writer.get_contents()
def test_utf8_rewrite_content_adjust(self):
    """Parse a response record with non-ASCII HTTP headers and re-write it.

    A WARC response record is assembled by hand, parsed with
    ``ArcWarcRecordLoader``, written through a non-gzip ``BufferWARCWriter``
    and compared with ``RESPONSE_RECORD_UNICODE_HEADERS``. The record read
    back from the writer is expected to report a length of 268, whereas the
    hand-built input declares a Content-Length of 226.
    """
    # NOTE: the literals below rely on backslash-newline continuation inside
    # the string, so every continued line must start at column 0; any
    # leading whitespace would become part of the payload and change the
    # byte counts asserted further down.
    UTF8_PAYLOAD = u'\
HTTP/1.0 200 OK\r\n\
Content-Type: text/plain; charset="UTF-8"\r\n\
Content-Disposition: attachment; filename="испытание.txt"\r\n\
Custom-Header: somevalue\r\n\
Unicode-Header: %F0%9F%93%81%20text%20%F0%9F%97%84%EF%B8%8F\r\n\
\r\n\
some\n\
text'

    # Length in bytes (not characters): the Cyrillic filename is multi-byte
    # in UTF-8.
    content_length = len(UTF8_PAYLOAD.encode('utf-8'))

    # WARC headers, the computed Content-Length, then the HTTP payload
    # followed by the two CRLFs that end the record.
    UTF8_RECORD = u'\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Block-Digest: sha1:KMUABC6URWIQ7QXCZDQ5FS6WIBBFRORR\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: {0}\r\n\
\r\n\
{1}\r\n\
\r\n\
'.format(content_length, UTF8_PAYLOAD)

    # Sanity check on the fixture itself before exercising the writer.
    assert (content_length == 226)

    record = ArcWarcRecordLoader().parse_record_stream(
        BytesIO(UTF8_RECORD.encode('utf-8')))

    writer = BufferWARCWriter(gzip=False)
    writer.write_record(record)

    raw_buff = writer.get_contents()
    assert raw_buff.decode('utf-8') == RESPONSE_RECORD_UNICODE_HEADERS

    # The re-read record is longer than the 226-byte input.
    # NOTE(review): presumably the writer re-encodes the non-ASCII header
    # value and adjusts the length accordingly -- confirm against
    # RESPONSE_RECORD_UNICODE_HEADERS.
    for record in ArchiveIterator(writer.get_stream()):
        assert record.length == 268
def test_identity(self):
    """ read(write(record)) should yield record """
    body = b'foobar'
    warc_writer = BufferWARCWriter(gzip=True)

    request_headers = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
    extra_warc_headers = {'Foo': 'Bar'}

    original = warc_writer.create_warc_record(
        'http://example.com/',
        'request',
        payload=BytesIO(body),
        warc_headers_dict=extra_warc_headers,
        http_headers=request_headers)
    warc_writer.write_record(original)

    # Attributes that must survive the write/read round trip, checked in
    # this order for every record read back.
    compared = ('rec_type', 'rec_headers', 'content_type', 'length', 'http_headers')

    for parsed in ArchiveIterator(warc_writer.get_stream()):
        for attr in compared:
            assert getattr(parsed, attr) == getattr(original, attr)
        assert parsed.raw_stream.read() == body
def create_warcinfo(self, creator, name, metadata, source, serialized, filename):
    """Produce the serialized warcinfo record for a recording.

    ``metadata`` is updated in place: entries of ``serialized`` whose key is
    in ``self.COPY_FIELDS`` are copied over, and when no truthy ``title`` is
    present one is built from ``self.DEFAULT_REC_TITLE`` and the
    ``created_at`` value, with ``auto_title`` set to ``True``.

    :param creator: object whose ``name`` is stored in the ``creator`` field
    :param name: value stored in the ``isPartOf`` field
    :param metadata: dict that is augmented and then JSON-serialized
    :param source: object providing ``to_iso_date`` for the default title
    :param serialized: mapping from which the ``COPY_FIELDS`` entries are taken
    :param filename: filename passed to ``create_warcinfo_record``
    :return: the contents of the buffer writer after the record is written
    """
    for key, copied_value in iteritems(serialized):
        if key not in self.COPY_FIELDS:
            continue
        metadata[key] = copied_value

    has_title = bool(metadata.get('title'))
    if not has_title:
        # Fall back to a date-based title and mark it as auto-generated.
        title_template = self.DEFAULT_REC_TITLE
        iso_created = source.to_iso_date(metadata['created_at'], no_T=True)
        metadata['title'] = title_template.format(iso_created)
        metadata['auto_title'] = True

    field_pairs = [
        ('software', 'Webrecorder Platform v' + __version__),
        ('format', 'WARC File Format 1.0'),
        ('creator', creator.name),
        ('isPartOf', name),
        ('json-metadata', json.dumps(metadata)),
    ]
    warcinfo_fields = OrderedDict(field_pairs)

    record_writer = BufferWARCWriter()
    warcinfo_record = record_writer.create_warcinfo_record(filename, warcinfo_fields)
    record_writer.write_record(warcinfo_record)
    return record_writer.get_contents()