示例#1
0
    def test_post_stream(self):
        """POST a file-like body under capture_http and inspect both WARC records.

        The first record read back is expected to be the response, whose
        payload must match the JSON the client received; the second is the
        request, whose payload must be the bytes that were posted.
        """
        def nop_filter(request, response, warc_writer):
            # Pass-through filter: only verifies both records were supplied.
            assert request
            assert response
            return request, response

        buffer_writer = BufferWARCWriter(gzip=False)
        body = BytesIO(b'somedatatopost')
        target = 'http://localhost:{0}/post'.format(self.port)

        with capture_http(buffer_writer, nop_filter):
            res = requests.post(target, data=body)

        records = ArchiveIterator(buffer_writer.get_stream())

        # Response record comes first in the stream.
        resp_rec = next(records)
        assert resp_rec.rec_type == 'response'
        resp_headers = resp_rec.rec_headers
        assert resp_headers['WARC-Target-URI'] == target
        assert resp_headers['WARC-IP-Address'] == '127.0.0.1'

        captured_body = resp_rec.content_stream().read().decode('utf-8')
        assert res.json() == json.loads(captured_body)

        # Request record follows.
        req_rec = next(records)
        assert req_rec.rec_type == 'request'
        req_headers = req_rec.rec_headers
        assert req_headers['WARC-Target-URI'] == target
        assert req_headers['WARC-IP-Address'] == '127.0.0.1'

        posted = req_rec.content_stream().read().decode('utf-8')
        assert posted == 'somedatatopost'
示例#2
0
    def test_post_json(self):
        """POST a JSON document under capture_http and check the captured pair.

        Verifies the echoed JSON in the live response, that the captured
        response payload matches it, and that the captured request carries
        the JSON content type and the serialized body.
        """
        buffer_writer = BufferWARCWriter(gzip=False)
        target = 'http://localhost:{0}/post'.format(self.port)

        with capture_http(buffer_writer):
            res = requests.post(
                target,
                headers={'Host': 'httpbin.org'},
                json={'some': {'data': 'posted'}},
            )

        assert res.json()['json'] == {'some': {'data': 'posted'}}

        records = ArchiveIterator(buffer_writer.get_stream())

        # Response record comes first in the stream.
        resp_rec = next(records)
        assert resp_rec.rec_type == 'response'

        captured_body = resp_rec.content_stream().read().decode('utf-8')
        assert res.json() == json.loads(captured_body)

        # Request record follows.
        req_rec = next(records)
        assert req_rec.rec_type == 'request'
        assert req_rec.http_headers['Content-Type'] == 'application/json'

        posted = req_rec.content_stream().read().decode('utf-8')
        assert posted == '{"some": {"data": "posted"}}'
示例#3
0
def test_warc_enabled():
    """Check that a YahooGroupsAPI call is recorded as a response/request pair.

    The responses mocking framework is deliberately not used here because it
    conflicts with the WARC capture. As a consequence this test issues a real
    request to Yahoo and may fail for reasons unrelated to the code.
    """
    target = 'https://groups.yahoo.com/api/v1/groups/test/'

    api = YahooGroupsAPI('test')
    buffer_writer = BufferWARCWriter(gzip=False)
    api.set_warc_writer(buffer_writer)
    api.get_json('HackGroupInfo')

    # Collect (target URI, record type) for every record that was written.
    captured = []
    for rec in ArchiveIterator(buffer_writer.get_stream()):
        captured.append((rec.rec_headers['WARC-Target-URI'], rec.rec_type))

    wanted = [(target, 'response'), (target, 'request')]
    assert wanted == captured
示例#4
0
    def test_response_warc_1_1(self, is_gzip, builder_factory):
        """Write one WARC/1.1 response record and check protocol and date format.

        Exactly one record must be read back, its protocol must be
        'WARC/1.1', and its WARC-Date must contain a '.' and be 27
        characters long.
        """
        version = 'WARC/1.1'
        writer = BufferWARCWriter(gzip=is_gzip, warc_version=version)
        builder = builder_factory(writer, warc_version=version)

        writer.write_record(sample_response(builder))

        records = list(ArchiveIterator(writer.get_stream()))
        assert len(records) == 1

        headers = records[0].rec_headers
        assert headers.protocol == 'WARC/1.1'

        # ISO 8601 date with fractional seconds (microseconds)
        warc_date = headers['WARC-Date']
        assert '.' in warc_date
        assert len(warc_date) == 27
示例#5
0
    def test_request_response_concur(self):
        """Write a request/response pair and check they are cross-linked.

        After reading the two records back (response first), their record
        IDs must differ and the request's WARC-Concurrent-To header must
        equal the response's WARC-Record-ID.
        """
        writer = BufferWARCWriter(gzip=False)

        written_resp = self._sample_response(writer)
        written_req = self._sample_request(writer)
        writer.write_request_response_pair(written_req, written_resp)

        # Exactly two records are expected; unpacking fails otherwise.
        read_resp, read_req = list(ArchiveIterator(writer.get_stream()))

        response_id = read_resp.rec_headers.get_header('WARC-Record-ID')
        request_id = read_req.rec_headers.get_header('WARC-Record-ID')
        assert response_id != request_id

        concurrent_to = read_req.rec_headers.get_header('WARC-Concurrent-To')
        assert response_id == concurrent_to
示例#6
0
    def test_get_cache_to_file(self):
        """GET a body of BUFF_SIZE * 2 bytes under capture_http and verify capture.

        NOTE(review): the size is presumably chosen to exceed an in-memory
        buffer so the capture spills to a file -- confirm against capture_http.
        """
        body_size = BUFF_SIZE * 2
        buffer_writer = BufferWARCWriter(gzip=False)
        target = 'http://localhost:{0}/bytes/{1}'.format(self.port, body_size)

        with capture_http(buffer_writer):
            res = requests.get(target, headers={'Host': 'httpbin.org'})

        assert len(res.content) == body_size

        records = ArchiveIterator(buffer_writer.get_stream())

        # Response record comes first; its payload must equal the live body.
        resp_rec = next(records)
        assert resp_rec.rec_type == 'response'
        resp_headers = resp_rec.rec_headers
        assert resp_headers['WARC-Target-URI'] == target
        assert resp_headers['WARC-IP-Address'] == '127.0.0.1'
        assert res.content == resp_rec.content_stream().read()

        # Request record follows.
        req_rec = next(records)
        assert req_rec.rec_type == 'request'
        req_headers = req_rec.rec_headers
        assert req_headers['WARC-Target-URI'] == target
        assert req_headers['WARC-IP-Address'] == '127.0.0.1'
示例#7
0
    def test_request_response_concur(self, is_gzip, builder_factory):
        """Write a request/response pair via RecordBuilder and check the link.

        After reading the two records back (response first), their record
        IDs must differ and the request's WARC-Concurrent-To header must
        equal the response's WARC-Record-ID.
        """
        writer = BufferWARCWriter(gzip=is_gzip)
        builder = builder_factory(writer, builder_cls=RecordBuilder)

        written_resp = sample_response(builder)
        written_req = sample_request(builder)
        writer.write_request_response_pair(written_req, written_resp)

        # Exactly two records are expected; unpacking fails otherwise.
        read_resp, read_req = list(ArchiveIterator(writer.get_stream()))

        response_id = read_resp.rec_headers.get_header('WARC-Record-ID')
        request_id = read_req.rec_headers.get_header('WARC-Record-ID')
        assert response_id != request_id

        concurrent_to = read_req.rec_headers.get_header('WARC-Concurrent-To')
        assert response_id == concurrent_to
示例#8
0
    def test_utf8_rewrite_content_adjust(self):
        UTF8_PAYLOAD = u'\
HTTP/1.0 200 OK\r\n\
Content-Type: text/plain; charset="UTF-8"\r\n\
Content-Disposition: attachment; filename="испытание.txt"\r\n\
Custom-Header: somevalue\r\n\
Unicode-Header: %F0%9F%93%81%20text%20%F0%9F%97%84%EF%B8%8F\r\n\
\r\n\
some\n\
text'

        content_length = len(UTF8_PAYLOAD.encode('utf-8'))

        UTF8_RECORD = u'\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Block-Digest: sha1:KMUABC6URWIQ7QXCZDQ5FS6WIBBFRORR\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: {0}\r\n\
\r\n\
{1}\r\n\
\r\n\
'.format(content_length, UTF8_PAYLOAD)

        assert (content_length == 226)

        record = ArcWarcRecordLoader().parse_record_stream(
            BytesIO(UTF8_RECORD.encode('utf-8')))

        writer = BufferWARCWriter(gzip=False)
        writer.write_record(record)

        raw_buff = writer.get_contents()
        assert raw_buff.decode('utf-8') == RESPONSE_RECORD_UNICODE_HEADERS

        for record in ArchiveIterator(writer.get_stream()):
            assert record.length == 268
示例#9
0
    def test_identity(self):
        """read(write(record)) should yield record.

        A single gzip-compressed request record is created and written, then
        read back; its type, WARC headers, content type, length, HTTP
        headers and raw payload must all match the original.
        """
        payload = b'foobar'
        writer = BufferWARCWriter(gzip=True)
        http_headers = StatusAndHeaders('GET / HTTP/1.1', {},
                                        is_http_request=True)
        warc_headers = {'Foo': 'Bar'}
        record = writer.create_warc_record('http://example.com/',
                                           'request',
                                           payload=BytesIO(payload),
                                           warc_headers_dict=warc_headers,
                                           http_headers=http_headers)

        writer.write_record(record)

        # Count the records so the comparisons below cannot be silently
        # skipped when the iterator yields nothing.
        records_read = 0
        for new_rec in ArchiveIterator(writer.get_stream()):
            assert new_rec.rec_type == record.rec_type
            assert new_rec.rec_headers == record.rec_headers
            assert new_rec.content_type == record.content_type
            assert new_rec.length == record.length
            assert new_rec.http_headers == record.http_headers
            assert new_rec.raw_stream.read() == payload
            records_read += 1

        assert records_read == 1
示例#10
0
    def test_request_response_concur(self, is_gzip, builder_factory):
        """Write a request/response pair after an explicit ensure_digest call.

        After reading the two records back (response first), their record
        IDs must differ and the request's WARC-Concurrent-To header must
        equal the response's WARC-Record-ID.
        """
        writer = BufferWARCWriter(gzip=is_gzip)
        builder = builder_factory(writer, builder_cls=RecordBuilder)

        written_resp = sample_response(builder)
        written_req = sample_request(builder)

        # test explicitly calling ensure_digest with block digest enabled on a record
        writer.ensure_digest(written_resp, block=True, payload=True)

        writer.write_request_response_pair(written_req, written_resp)

        # Exactly two records are expected; unpacking fails otherwise.
        read_resp, read_req = list(ArchiveIterator(writer.get_stream()))

        response_id = read_resp.rec_headers.get_header('WARC-Record-ID')
        request_id = read_req.rec_headers.get_header('WARC-Record-ID')
        assert response_id != request_id

        concurrent_to = read_req.rec_headers.get_header('WARC-Concurrent-To')
        assert response_id == concurrent_to
示例#11
0
    def test_post_chunked(self):
        """POST an iterator body with record_ip=False and check both records.

        The body is supplied as an iterator of byte chunks. Neither the
        response nor the request record may carry a WARC-IP-Address header,
        the captured response payload must match the JSON the client
        received, and the captured request payload must be the joined chunks.
        """
        warc_writer = BufferWARCWriter(gzip=False)

        def nop_filter(request, response, recorder):
            # Pass-through filter: only verifies both records were supplied.
            assert request
            assert response
            return request, response

        def gen():
            return iter([b'some', b'data', b'to', b'post'])

        # NOTE: this test targets the public httpbin.org service instead of
        # the local test server on self.port, so it needs network access.
        url = 'https://httpbin.org/post'

        with capture_http(warc_writer, nop_filter, record_ip=False):
            res = requests.post(url,
                                data=gen(),
                                headers={'Content-Type': 'application/json'})

        # response
        ai = ArchiveIterator(warc_writer.get_stream())
        response = next(ai)
        assert response.rec_type == 'response'
        assert response.rec_headers['WARC-Target-URI'] == url
        assert 'WARC-IP-Address' not in response.rec_headers

        assert res.json() == json.loads(
            response.content_stream().read().decode('utf-8'))

        # request
        request = next(ai)
        assert request.rec_type == 'request'
        assert request.rec_headers['WARC-Target-URI'] == url
        # Check the request record itself; the original re-checked the
        # response headers here, leaving the request unverified.
        assert 'WARC-IP-Address' not in request.rec_headers

        data = request.content_stream().read().decode('utf-8')
        assert data == 'somedatatopost'