def make_detailed_warcinfo(filename, guid, coll_title, coll_desc, rec_title, pages):
    """Return a gzip-enabled WARC buffer holding two warcinfo records.

    The first record carries collection-level JSON metadata (title and
    description); the second carries recording-level JSON metadata (title
    and pages). Both records share the same ``operator``, ``Perma-GUID``
    and ``format`` fields and are created for the same ``filename``.

    :param filename: name passed to ``create_warcinfo_record`` for both records
    :param guid: value stored in the ``Perma-GUID`` field
    :param coll_title: title placed in the collection metadata
    :param coll_desc: description placed in the collection metadata
    :param rec_title: title placed in the recording metadata
    :param pages: value placed under ``pages`` in the recording metadata;
        it is passed to ``json.dumps``, so it must be JSON-serializable
    :return: whatever ``BufferWARCWriter.get_contents()`` yields after both
        records have been written
    """
    #
    # Thank you! Rhizome/Webrecorder.io/Ilya Kreymer
    #
    collection_info = {'type': 'collection', 'title': coll_title, 'desc': coll_desc}
    recording_info = {'type': 'recording', 'title': rec_title, 'pages': pages}

    buffer_writer = BufferWARCWriter(gzip=True)

    # Fields common to both warcinfo records; 'json-metadata' is (re)assigned
    # per record below and always stays the fourth field.
    fields = OrderedDict()
    fields['operator'] = 'Perma.cc download'
    fields['Perma-GUID'] = guid
    fields['format'] = 'WARC File Format 1.0'

    # Collection info first, then recording info.
    for info in (collection_info, recording_info):
        fields['json-metadata'] = json.dumps(info)
        warcinfo = buffer_writer.create_warcinfo_record(filename, fields)
        buffer_writer.write_record(warcinfo)

    return buffer_writer.get_contents()
def test_skip_filter(self):
    """Check that a filter returning ``(None, None)`` leaves the buffer empty.

    The request itself must still succeed and return the expected JSON,
    but the WARC writer handed to ``capture_http`` must receive no data.
    """
    def drop_everything(request, response, warc_writer):
        # The filter must be handed both sides of the exchange.
        assert request
        assert response
        # Returning a pair of Nones tells the capture to skip this exchange.
        return None, None

    buffer_writer = BufferWARCWriter(gzip=False)
    target_url = 'http://localhost:{0}/get?foo=bar'.format(self.port)

    with capture_http(buffer_writer, drop_everything):
        reply = requests.get(target_url, headers={'Host': 'httpbin.org'})

    assert reply.json()['args'] == {'foo': 'bar'}

    # skipped, nothing written
    assert buffer_writer.get_contents() == b''
def create_warcinfo(self, creator, title, metadata, source, filename):
    """Serialize a single warcinfo record and return the written buffer.

    Entries of ``source`` whose key appears in ``self.COPY_FIELDS`` are
    copied into ``metadata`` first. Note that ``metadata`` is modified in
    place, so the caller's dict is updated as well.

    :param creator: value stored in the ``creator`` field
    :param title: value stored in the ``isPartOf`` field
    :param metadata: dict that is JSON-encoded into ``json-metadata``
    :param source: mapping to copy ``COPY_FIELDS`` entries from
    :param filename: name passed to ``create_warcinfo_record``
    :return: result of ``BufferWARCWriter.get_contents()`` after the write
    """
    for field, field_value in iteritems(source):
        if field not in self.COPY_FIELDS:
            continue
        metadata[field] = field_value

    warcinfo_fields = OrderedDict()
    warcinfo_fields['software'] = 'Webrecorder Platform v' + __version__
    warcinfo_fields['format'] = 'WARC File Format 1.0'
    warcinfo_fields['creator'] = creator
    warcinfo_fields['isPartOf'] = title
    warcinfo_fields['json-metadata'] = json.dumps(metadata)

    buffer_writer = BufferWARCWriter()
    warcinfo_record = buffer_writer.create_warcinfo_record(filename, warcinfo_fields)
    buffer_writer.write_record(warcinfo_record)
    return buffer_writer.get_contents()
def create_warcinfo(self, creator, name, metadata, source, serialized, filename):
    """Serialize a single warcinfo record and return the written buffer.

    Entries of ``serialized`` whose key appears in ``self.COPY_FIELDS`` are
    copied into ``metadata``. If ``metadata`` then has no truthy ``title``,
    one is generated from ``self.DEFAULT_REC_TITLE`` and the ``created_at``
    value, and ``auto_title`` is set to ``True``. ``metadata`` is modified
    in place.

    :param creator: object whose ``name`` attribute fills the ``creator`` field
    :param name: value stored in the ``isPartOf`` field
    :param metadata: dict that is JSON-encoded into ``json-metadata``
    :param source: object providing ``to_iso_date`` for the generated title
    :param serialized: mapping to copy ``COPY_FIELDS`` entries from
    :param filename: name passed to ``create_warcinfo_record``
    :return: result of ``BufferWARCWriter.get_contents()`` after the write
    """
    for field, field_value in iteritems(serialized):
        if field not in self.COPY_FIELDS:
            continue
        metadata[field] = field_value

    if not metadata.get('title'):
        # No usable title: derive one from the creation date and flag it.
        metadata['title'] = self.DEFAULT_REC_TITLE.format(
            source.to_iso_date(metadata['created_at'], no_T=True)
        )
        metadata['auto_title'] = True

    warcinfo_fields = OrderedDict()
    warcinfo_fields['software'] = 'Webrecorder Platform v' + __version__
    warcinfo_fields['format'] = 'WARC File Format 1.0'
    warcinfo_fields['creator'] = creator.name
    warcinfo_fields['isPartOf'] = name
    warcinfo_fields['json-metadata'] = json.dumps(metadata)

    buffer_writer = BufferWARCWriter()
    warcinfo_record = buffer_writer.create_warcinfo_record(filename, warcinfo_fields)
    buffer_writer.write_record(warcinfo_record)
    return buffer_writer.get_contents()
def test_utf8_rewrite_content_adjust(self):
    """Round-trip a response record with non-ASCII HTTP headers.

    A WARC response record whose HTTP headers contain UTF-8 text is parsed,
    re-written through an uncompressed ``BufferWARCWriter`` and compared to
    ``RESPONSE_RECORD_UNICODE_HEADERS``; the re-read record is expected to
    report a length of 268 (versus the 226-byte payload that went in).
    """
    # HTTP response (status line, headers, blank line, body) embedded in
    # the WARC record below. Adjacent literals are concatenated as-is.
    http_payload = (
        u'HTTP/1.0 200 OK\r\n'
        u'Content-Type: text/plain; charset="UTF-8"\r\n'
        u'Content-Disposition: attachment; filename="испытание.txt"\r\n'
        u'Custom-Header: somevalue\r\n'
        u'Unicode-Header: %F0%9F%93%81%20text%20%F0%9F%97%84%EF%B8%8F\r\n'
        u'\r\n'
        u'some\n'
        u'text'
    )

    # Content-Length must count encoded bytes, not characters.
    payload_size = len(http_payload.encode('utf-8'))

    record_template = (
        u'WARC/1.0\r\n'
        u'WARC-Type: response\r\n'
        u'WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n'
        u'WARC-Target-URI: http://example.com/\r\n'
        u'WARC-Date: 2000-01-01T00:00:00Z\r\n'
        u'WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n'
        u'WARC-Block-Digest: sha1:KMUABC6URWIQ7QXCZDQ5FS6WIBBFRORR\r\n'
        u'Content-Type: application/http; msgtype=response\r\n'
        u'Content-Length: {0}\r\n'
        u'\r\n'
        u'{1}\r\n'
        u'\r\n'
    )
    warc_text = record_template.format(payload_size, http_payload)

    assert (payload_size == 226)

    input_stream = BytesIO(warc_text.encode('utf-8'))
    parsed_record = ArcWarcRecordLoader().parse_record_stream(input_stream)

    writer = BufferWARCWriter(gzip=False)
    writer.write_record(parsed_record)

    written_bytes = writer.get_contents()
    assert written_bytes.decode('utf-8') == RESPONSE_RECORD_UNICODE_HEADERS

    for reread in ArchiveIterator(writer.get_stream()):
        assert reread.length == 268
def create_warcinfo(self, creator, name, metadata, source, serialized, filename):
    """Build one warcinfo record and return the buffer it was written to.

    Keys of ``serialized`` listed in ``self.COPY_FIELDS`` are copied into
    ``metadata`` (which is updated in place). When the resulting metadata
    lacks a truthy ``title``, a title is formatted from
    ``self.DEFAULT_REC_TITLE`` using the ``created_at`` date and the
    ``auto_title`` flag is set.

    :param creator: object whose ``name`` attribute fills the ``creator`` field
    :param name: value stored in the ``isPartOf`` field
    :param metadata: dict that is JSON-encoded into ``json-metadata``
    :param source: object providing ``to_iso_date`` for the generated title
    :param serialized: mapping to copy ``COPY_FIELDS`` entries from
    :param filename: name passed to ``create_warcinfo_record``
    :return: result of ``BufferWARCWriter.get_contents()`` after the write
    """
    for key, value in iteritems(serialized):
        if key in self.COPY_FIELDS:
            metadata[key] = value

    has_title = bool(metadata.get('title'))
    if not has_title:
        # Fall back to a date-based title and mark it as auto-generated.
        title_template = self.DEFAULT_REC_TITLE
        created_on = source.to_iso_date(metadata['created_at'], no_T=True)
        metadata['title'] = title_template.format(created_on)
        metadata['auto_title'] = True

    software_id = 'Webrecorder Platform v' + __version__
    creator_name = creator.name
    metadata_json = json.dumps(metadata)

    info = OrderedDict((
        ('software', software_id),
        ('format', 'WARC File Format 1.0'),
        ('creator', creator_name),
        ('isPartOf', name),
        ('json-metadata', metadata_json),
    ))

    wi_writer = BufferWARCWriter()
    warcinfo_record = wi_writer.create_warcinfo_record(filename, info)
    wi_writer.write_record(warcinfo_record)
    return wi_writer.get_contents()