示例#1
0
文件: utils.py 项目: yingziwu/perma
def make_detailed_warcinfo(filename, guid, coll_title, coll_desc, rec_title, pages):
    """Build a WARC buffer holding two warcinfo records and return its contents.

    The first record embeds collection-level JSON metadata (title and
    description); the second embeds recording-level JSON metadata (title and
    pages). Both records share the same operator, Perma-GUID and format
    fields and are written through a ``BufferWARCWriter`` created with
    ``gzip=True``.

    Credit: Rhizome/Webrecorder.io/Ilya Kreymer.

    Returns the result of ``BufferWARCWriter.get_contents()`` after both
    records have been written.
    """
    writer = BufferWARCWriter(gzip=True)

    # Fields common to both records. 'json-metadata' is assigned per record
    # below, so it always ends up as the last key of the mapping.
    params = OrderedDict([('operator', 'Perma.cc download'),
                          ('Perma-GUID', guid),
                          ('format', 'WARC File Format 1.0')])

    collection_info = {'type': 'collection', 'title': coll_title, 'desc': coll_desc}
    recording_info = {'type': 'recording', 'title': rec_title, 'pages': pages}

    # Collection record first, then the recording record.
    for info in (collection_info, recording_info):
        params['json-metadata'] = json.dumps(info)
        writer.write_record(writer.create_warcinfo_record(filename, params))

    return writer.get_contents()
示例#2
0
    def create_warcinfo(self, creator, title, metadata, source, filename):
        """Serialize one warcinfo record and return the writer's buffer contents.

        Entries of ``source`` whose key is listed in ``self.COPY_FIELDS`` are
        copied into ``metadata`` first (the passed-in ``metadata`` is modified
        in place); ``metadata`` is then embedded as JSON in the record.
        """
        metadata.update(
            (field, field_value)
            for field, field_value in iteritems(source)
            if field in self.COPY_FIELDS
        )

        warcinfo_fields = OrderedDict()
        warcinfo_fields['software'] = 'Webrecorder Platform v' + __version__
        warcinfo_fields['format'] = 'WARC File Format 1.0'
        warcinfo_fields['creator'] = creator
        warcinfo_fields['isPartOf'] = title
        warcinfo_fields['json-metadata'] = json.dumps(metadata)

        buffer_writer = BufferWARCWriter()
        warcinfo_record = buffer_writer.create_warcinfo_record(filename, warcinfo_fields)
        buffer_writer.write_record(warcinfo_record)
        return buffer_writer.get_contents()
示例#3
0
    def test_response_warc_1_1(self, is_gzip, builder_factory):
        """Write a sample response as WARC/1.1, read it back, check its headers.

        ``is_gzip`` and ``builder_factory`` are supplied by the test runner
        (presumably pytest fixtures/parameters -- TODO confirm).
        """
        version = 'WARC/1.1'
        writer = BufferWARCWriter(gzip=is_gzip, warc_version=version)
        builder = builder_factory(writer, warc_version=version)
        writer.write_record(sample_response(builder))

        records = list(ArchiveIterator(writer.get_stream()))
        assert len(records) == 1

        headers = records[0].rec_headers
        assert headers.protocol == 'WARC/1.1'

        # The date must carry fractional seconds; 27 characters matches a
        # 'YYYY-MM-DDTHH:MM:SS.ffffffZ' timestamp (microsecond precision).
        warc_date = headers['WARC-Date']
        assert '.' in warc_date
        assert len(warc_date) == 27
    def create_warcinfo(self, creator, name, metadata, source, serialized, filename):
        """Serialize one warcinfo record and return the writer's buffer contents.

        Entries of ``serialized`` whose key is in ``self.COPY_FIELDS`` are
        copied into ``metadata`` (modified in place). When ``metadata`` has no
        truthy 'title', one is generated from ``self.DEFAULT_REC_TITLE`` and
        ``metadata['created_at']``, and 'auto_title' is set to True.
        """
        for field, field_value in iteritems(serialized):
            if field not in self.COPY_FIELDS:
                continue
            metadata[field] = field_value

        has_title = bool(metadata.get('title'))
        if not has_title:
            title_template = self.DEFAULT_REC_TITLE
            created = source.to_iso_date(metadata['created_at'], no_T=True)
            metadata['title'] = title_template.format(created)
            # Flag the title as generated rather than supplied in metadata.
            metadata['auto_title'] = True

        field_pairs = [
            ('software', 'Webrecorder Platform v' + __version__),
            ('format', 'WARC File Format 1.0'),
            ('creator', creator.name),
            ('isPartOf', name),
            ('json-metadata', json.dumps(metadata)),
        ]

        buffer_writer = BufferWARCWriter()
        warcinfo_record = buffer_writer.create_warcinfo_record(
            filename, OrderedDict(field_pairs))
        buffer_writer.write_record(warcinfo_record)
        return buffer_writer.get_contents()
示例#5
0
    def test_utf8_rewrite_content_adjust(self):
        UTF8_PAYLOAD = u'\
HTTP/1.0 200 OK\r\n\
Content-Type: text/plain; charset="UTF-8"\r\n\
Content-Disposition: attachment; filename="испытание.txt"\r\n\
Custom-Header: somevalue\r\n\
Unicode-Header: %F0%9F%93%81%20text%20%F0%9F%97%84%EF%B8%8F\r\n\
\r\n\
some\n\
text'

        content_length = len(UTF8_PAYLOAD.encode('utf-8'))

        UTF8_RECORD = u'\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Block-Digest: sha1:KMUABC6URWIQ7QXCZDQ5FS6WIBBFRORR\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: {0}\r\n\
\r\n\
{1}\r\n\
\r\n\
'.format(content_length, UTF8_PAYLOAD)

        assert (content_length == 226)

        record = ArcWarcRecordLoader().parse_record_stream(
            BytesIO(UTF8_RECORD.encode('utf-8')))

        writer = BufferWARCWriter(gzip=False)
        writer.write_record(record)

        raw_buff = writer.get_contents()
        assert raw_buff.decode('utf-8') == RESPONSE_RECORD_UNICODE_HEADERS

        for record in ArchiveIterator(writer.get_stream()):
            assert record.length == 268
示例#6
0
    def test_identity(self):
        """Writing a record and reading it back yields an equal record."""
        body = b'foobar'
        writer = BufferWARCWriter(gzip=True)
        request_headers = StatusAndHeaders('GET / HTTP/1.1', {},
                                           is_http_request=True)
        original = writer.create_warc_record(
            'http://example.com/',
            'request',
            payload=BytesIO(body),
            warc_headers_dict={'Foo': 'Bar'},
            http_headers=request_headers)

        writer.write_record(original)

        # Attributes that must survive the write/read round trip unchanged.
        compared = ('rec_type', 'rec_headers', 'content_type', 'length',
                    'http_headers')

        for roundtripped in ArchiveIterator(writer.get_stream()):
            for attr in compared:
                assert getattr(roundtripped, attr) == getattr(original, attr)
            assert roundtripped.raw_stream.read() == body
示例#7
0
    def create_warcinfo(self, creator, name, metadata, source, serialized,
                        filename):
        """Build a warcinfo record for a recording and return it serialized.

        ``metadata`` is updated in place: entries of ``serialized`` whose key
        appears in ``self.COPY_FIELDS`` are copied over, and if no truthy
        'title' is present afterwards a default one is derived from
        ``metadata['created_at']`` and 'auto_title' is set to True.

        Returns the contents of the ``BufferWARCWriter`` the record was
        written to.
        """
        metadata.update(
            (key, value)
            for key, value in iteritems(serialized)
            if key in self.COPY_FIELDS)

        if not metadata.get('title'):
            default_title = self.DEFAULT_REC_TITLE
            iso_created = source.to_iso_date(metadata['created_at'],
                                             no_T=True)
            metadata['title'] = default_title.format(iso_created)
            metadata['auto_title'] = True

        # Insertion order determines the field order handed to the writer.
        info = OrderedDict()
        info['software'] = 'Webrecorder Platform v' + __version__
        info['format'] = 'WARC File Format 1.0'
        info['creator'] = creator.name
        info['isPartOf'] = name
        info['json-metadata'] = json.dumps(metadata)

        wi_writer = BufferWARCWriter()
        warcinfo = wi_writer.create_warcinfo_record(filename, info)
        wi_writer.write_record(warcinfo)
        return wi_writer.get_contents()