Exemplo n.º 1
0
    def test_record_param_user_coll_skip(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index(dupe_policy=SkipDupePolicy())

        recorder_app = RecorderApp(
            self.upstream_url,
            PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

        # No new entries written
        self._test_all_warcs('/warcs/USER/COLL/', 2)

        resp = self._test_warc_write(
            recorder_app, 'httpbin.org', '/user-agent',
            '&param.recorder.user=USER&param.recorder.coll=COLL')

        assert '"user-agent": "{0}"'.format(UA) in resp.text
        #assert b'HTTP/1.1 200 OK' in resp.body
        #assert b'"foo": "bar"' in resp.body
        self._test_all_warcs('/warcs/USER/COLL/', 2)

        # Test Redis CDX
        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/',
                            '(org,httpbin,')
        assert len(res) == 2
Exemplo n.º 2
0
    def test_record_param_user_coll_write_dupe_no_revisit(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index(dupe_policy=WriteDupePolicy())

        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        resp = self._test_warc_write(
            recorder_app, 'httpbin.org', '/get?foo=bar',
            '&param.recorder.user=USER&param.recorder.coll=COLL')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/USER/COLL/', 3)

        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/',
                            '(org,httpbin,')
        assert len(res) == 3

        mimes = [CDXObject(x)['mime'] for x in res]

        assert sorted(mimes) == [
            'application/json', 'application/json', 'warc/revisit'
        ]

        assert len(writer.fh_cache) == 0
Exemplo n.º 3
0
    def test_record_skip_http_only_cookies_header(self):
        warc_path = to_path(self.root_dir + '/warcs/cookieskip_httponly/')
        header_filter = ExcludeHttpOnlyCookieHeaders()
        recorder_app = RecorderApp(self.upstream_url,
                                   PerRecordWARCWriter(
                                       warc_path, header_filter=header_filter),
                                   accept_colls='live')

        resp = self._test_warc_write(recorder_app, 'www.google.com', '/')
        assert b'HTTP/1.1 302' in resp.body

        buff = BytesIO(resp.body)
        record = ArcWarcRecordLoader().parse_record_stream(buff)

        non_http_only, http_only = self._get_http_only_cookies(record)
        # both httponly and other cookies
        assert http_only != None
        assert non_http_only != None

        stored_req, stored_resp = self._load_resp_req(warc_path)

        non_http_only, http_only = self._get_http_only_cookies(stored_resp)
        # no httponly cookies
        assert http_only == None
        assert non_http_only != None

        assert ('X-Other', 'foo') in stored_req.http_headers.headers
        assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers

        self._test_all_warcs('/warcs/cookieskip_httponly/', 1)
Exemplo n.º 4
0
    def test_record_video_metadata(self):
        pytest.importorskip('youtube_dl')
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        params = {'param.recorder.user': '******',
                  'param.recorder.coll': 'VIDEO',
                  'content_type': 'application/vnd.youtube-dl_formats+json'
                 }

        resp = self._test_warc_write(recorder_app,
            'www.youtube.com', '/v/BfBgWtAIbRc', '&' + urlencode(params),
            link_url='metadata://www.youtube.com/v/BfBgWtAIbRc')

        r = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = r.hgetall('USER:VIDEO:warc')
        assert len(warcs) == 1

        filename = list(warcs.values())[0]

        with open(filename, 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            record = ArcWarcRecordLoader().parse_record_stream(decomp)

        status_headers = record.rec_headers
        assert status_headers.get_header('WARC-Type') == 'metadata'
        assert status_headers.get_header('Content-Type') == 'application/vnd.youtube-dl_formats+json'
        assert status_headers.get_header('WARC-Block-Digest') != ''
        assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
Exemplo n.º 5
0
    def test_record_skip_all_cookies_header(self):
        warc_path = to_path(self.root_dir + '/warcs/cookieskip/')
        header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])
        recorder_app = RecorderApp(self.upstream_url,
                                   PerRecordWARCWriter(
                                       warc_path, header_filter=header_filter),
                                   accept_colls='live')

        resp = self._test_warc_write(
            recorder_app, 'httpbin.org',
            '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
        assert b'HTTP/1.1 302' in resp.body

        buff = BytesIO(resp.body)
        record = ArcWarcRecordLoader().parse_record_stream(buff)
        assert ('Set-Cookie',
                'name=value; Path=/') in record.http_headers.headers
        assert ('Set-Cookie', 'foo=bar; Path=/') in record.http_headers.headers

        stored_req, stored_resp = self._load_resp_req(warc_path)

        assert ('Set-Cookie',
                'name=value; Path=/') not in stored_resp.http_headers.headers
        assert ('Set-Cookie',
                'foo=bar; Path=/') not in stored_resp.http_headers.headers

        assert ('X-Other', 'foo') in stored_req.http_headers.headers
        assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers

        self._test_all_warcs('/warcs/cookieskip/', 1)
Exemplo n.º 6
0
    def test_record_param_user_coll(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        recorder_app = RecorderApp(self.upstream_url,
                        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

        self._test_all_warcs('/warcs/USER/COLL/', None)

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/user-agent',
                            '&param.recorder.user=USER&param.recorder.coll=COLL')

        assert '"user-agent": "{0}"'.format(UA) in resp.text
        #assert b'HTTP/1.1 200 OK' in resp.body
        #assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/USER/COLL/', 1)

        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(res) == 1

        cdx = CDXObject(res[0])
        assert cdx['urlkey'] == 'org,httpbin)/user-agent'
        assert cdx['mime'] == 'application/json'
        assert cdx['offset'] == '0'
        assert cdx['filename'].startswith(to_path('USER/COLL/'))
        assert cdx['filename'].endswith('.warc.gz')

        warcs = r.hgetall('USER:COLL:warc')
        full_path = to_path(self.root_dir + '/warcs/' + cdx['filename'])
        assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')}
Exemplo n.º 7
0
    def test_record_param_user_coll_revisit(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        recorder_app = RecorderApp(
            self.upstream_url,
            PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

        self._test_all_warcs('/warcs/USER/COLL/', 1)

        resp = self._test_warc_write(
            recorder_app, 'httpbin.org', '/user-agent',
            '&param.recorder.user=USER&param.recorder.coll=COLL')

        assert '"user-agent": "{0}"'.format(UA) in resp.text
        #assert b'HTTP/1.1 200 OK' in resp.body
        #assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/USER/COLL/', 2)

        # Test Redis CDX
        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/',
                            '(org,httpbin,')
        assert len(res) == 2

        if b'warc/revisit' in res[0]:
            cdx = CDXObject(res[0])
        else:
            cdx = CDXObject(res[1])

        assert cdx['urlkey'] == 'org,httpbin)/user-agent'
        assert cdx['mime'] == 'warc/revisit'
        assert cdx['offset'] == '0'
        assert cdx['filename'].startswith(to_path('USER/COLL/'))
        assert cdx['filename'].endswith('.warc.gz')

        fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

        warcs = r.hgetall('USER:COLL:warc')
        assert len(warcs) == 2
        assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode(
            'utf-8')

        with open(fullwarc, 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            # Test refers-to headers
            status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
            assert status_headers.get_header('WARC-Type') == 'revisit'
            assert status_headers.get_header(
                'WARC-Target-URI') == 'http://httpbin.org/user-agent'
            assert status_headers.get_header('WARC-Date') != ''
            assert status_headers.get_header(
                'WARC-Refers-To-Target-URI') == 'http://httpbin.org/user-agent'
            assert status_headers.get_header('WARC-Refers-To-Date') != ''
Exemplo n.º 8
0
    def test_record_skip_wrong_coll(self):
        recorder_app = RecorderApp(self.upstream_url,
                        writer=PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='not-live')

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/', 2)
Exemplo n.º 9
0
    def test_record_warc_1(self):
        recorder_app = RecorderApp(self.upstream_url,
                        PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')))

        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/', 1)
Exemplo n.º 10
0
    def test_error_url(self):
        recorder_app = RecorderApp(self.upstream_url + '01',
                        PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='live')


        testapp = webtest.TestApp(recorder_app)
        resp = testapp.get('/live/resource?url=http://example.com/', status=400)

        assert resp.json['error'] != ''

        self._test_all_warcs('/warcs/', 2)
Exemplo n.º 11
0
    def test_record_param_user_coll_same_dir(self):
        warc_path = to_path(self.root_dir + '/warcs2/')

        dedup_index = self._get_dedup_index()

        recorder_app = RecorderApp(self.upstream_url,
                        PerRecordWARCWriter(warc_path, dedup_index=dedup_index, key_template='{user}:{coll}'))

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?foo=bar', '&param.recorder.user=USER2&param.recorder.coll=COLL2')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        resp = self._test_warc_write(recorder_app, 'httpbin.org',
                            '/get?foo=bar', '&param.recorder.user=USER2&param.recorder.coll=COLL3')
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs2', 2)