Exemplo n.º 1
0
    def test_anon_download_coll(self):
        """Download the anonymous ``temp`` collection and index the returned WARC.

        Verifies the ``Content-Disposition`` header prefix, then indexes the
        de-chunked response body with ``include_all`` and ``cdxj`` enabled and
        expects 12 index entries.
        """
        response = self._get_anon('/temp/$download')

        disposition = response.headers['Content-Disposition']
        assert disposition.startswith("attachment; filename*=UTF-8''temp-")

        warc_stream = self._get_dechunked(response.body)

        index_buf = BytesIO()
        write_cdx_index(index_buf, warc_stream, 'temp.warc.gz',
                        include_all=True, cdxj=True)

        index_lines = index_buf.getvalue().rstrip().split(b'\n')
        cdx = [CDXObject(line) for line in index_lines]
        assert len(cdx) == 12

        # NOTE(review): the statements below *assign* url/mime on the parsed
        # entries; they do not compare anything. If assertions were intended,
        # nothing is verified here as written -- confirm before changing.
        overrides = (
            ('http://httpbin.org/get?food=bar', 'application/json'),  # response
            ('http://httpbin.org/get?food=bar', '-'),                 # request
            ('http://httpbin.org/get?bood=far', 'application/json'),  # response
            ('http://httpbin.org/get?bood=far', '-'),                 # request
            ('http://httpbin.org/get?boof=mar', '-'),                 # request
        )
        for position, (url, mime) in enumerate(overrides):
            cdx[position]['url'] = url
            cdx[position]['mime'] = mime
Exemplo n.º 2
0
    def test_anon_download_coll(self):
        """Fetch ``/temp/$download`` anonymously and index the WARC it returns.

        The response must be served as an attachment whose filename starts
        with ``temp-``; indexing the de-chunked body (``include_all`` and
        ``cdxj`` set) must produce 10 entries.
        """
        download = self._get_anon('/temp/$download')

        expected_prefix = "attachment; filename*=UTF-8''temp-"
        assert download.headers['Content-Disposition'].startswith(expected_prefix)

        warc_in = self._get_dechunked(download.body)

        out = BytesIO()
        write_cdx_index(out, warc_in, 'temp.warc.gz', include_all=True, cdxj=True)

        cdx = []
        for raw_line in out.getvalue().rstrip().split(b'\n'):
            cdx.append(CDXObject(raw_line))
        assert len(cdx) == 10

        def overwrite(position, url, mime):
            # NOTE(review): this stores values into the entry; it is not a
            # comparison. Confirm whether assertions were intended instead.
            cdx[position]['url'] = url
            cdx[position]['mime'] = mime

        # response
        overwrite(0, 'http://httpbin.org/get?food=bar', 'application/json')

        # request
        overwrite(1, 'http://httpbin.org/get?food=bar', '-')

        # response
        overwrite(2, 'http://httpbin.org/get?bood=far', 'application/json')

        # request
        overwrite(3, 'http://httpbin.org/get?bood=far', '-')
Exemplo n.º 3
0
def cdx_index(warc, **options):
    """Index a test WARC and return the generated index as bytes.

    :param warc: file name appended to ``TEST_WARC_DIR``; also passed to
        ``write_cdx_index`` as the filename argument
    :param options: keyword options forwarded unchanged to ``write_cdx_index``
    :return: everything ``write_cdx_index`` wrote to the in-memory buffer
    """
    index_out = BytesIO()
    warc_location = TEST_WARC_DIR + warc

    with open(warc_location, 'rb') as warc_stream:
        write_cdx_index(index_out, warc_stream, warc, **options)

    return index_out.getvalue()
Exemplo n.º 4
0
def test_cdxj_empty():
    """Indexing a zero-byte input with ``cdxj=True`` must write nothing."""
    index_out = BytesIO()
    no_input = BytesIO()

    write_cdx_index(index_out, no_input, 'empty.warc.gz', cdxj=True)

    assert index_out.getvalue() == b''
    def _load_and_index(self, warc_path):
        """Download one WARC from S3, index it, and upload the gzipped index.

        Flow (each step is logged through ``LOG``):

        1. If ``skip_existing`` is set and a HEAD on the target index key
           succeeds, return without doing anything.
        2. HEAD the WARC; on ``ClientError`` log an error and return.
        3. Download the WARC into a temporary file.
        4. Write the index through a ``GzipFile`` into a second temporary
           file and upload that file to the index bucket.

        ``botocore.client.ClientError`` from any S3 call is logged/ignored
        here and never propagates; the method always returns ``None``.

        :param warc_path: key of the WARC inside ``self.options.warc_bucket``
        """
        cdx_path = self._conv_warc_to_cdx_path(warc_path)

        LOG.info('Indexing WARC: ' + warc_path)
        s3 = boto3.client('s3', config=self.boto_config)
        opts = self.options

        if opts.skip_existing:
            try:
                s3.head_object(Bucket=opts.cdx_bucket, Key=cdx_path)
            except botocore.client.ClientError:
                pass  # ok, not found
            else:
                LOG.info('Already Exists: ' + cdx_path)
                return

        try:
            s3.head_object(Bucket=opts.warc_bucket, Key=warc_path)
        except botocore.client.ClientError:
            LOG.error('WARC not found: ' + warc_path)
            return

        temp_dir = opts.s3_local_temp_dir

        with TemporaryFile(mode='w+b', dir=temp_dir) as warc_tmp:
            LOG.info('Fetching WARC: ' + warc_path)
            try:
                s3.download_fileobj(opts.warc_bucket, warc_path, warc_tmp)
            except botocore.client.ClientError as err:
                LOG.error('Failed to download {}: {}'.format(warc_path, err))
                return

            # Rewind so the indexer reads the download from its first byte.
            warc_tmp.seek(0)
            LOG.info('Successfully fetched WARC: ' + warc_path)

            with TemporaryFile(mode='w+b', dir=temp_dir) as cdx_tmp:
                # Leaving the GzipFile context finishes the gzip stream
                # before the underlying temp file is uploaded.
                with GzipFile(fileobj=cdx_tmp, mode='w+b') as gz_out:
                    write_cdx_index(gz_out, warc_tmp, warc_path,
                                    **self.index_options)

                cdx_tmp.flush()
                cdx_tmp.seek(0)
                LOG.info('Uploading CDX: ' + cdx_path)
                try:
                    s3.upload_fileobj(cdx_tmp, opts.cdx_bucket, cdx_path)
                except botocore.client.ClientError as err:
                    LOG.error('Failed to upload {}: {}'.format(cdx_path, err))
                    return
                LOG.info('Successfully uploaded CDX: ' + cdx_path)
Exemplo n.º 6
0
    def _load_and_index(self, warc_path):
        """Index a single S3-hosted WARC and store the gzipped index in S3.

        The index key is derived with ``_conv_warc_to_cdx_path``. The method
        returns early (after logging) when the index already exists and
        ``skip_existing`` is set, when the WARC HEAD request fails, or when
        the download or upload raises ``botocore.client.ClientError``.
        Nothing is returned and no ``ClientError`` escapes.

        :param warc_path: S3 key of the WARC in ``self.options.warc_bucket``
        """
        cdx_path = self._conv_warc_to_cdx_path(warc_path)

        LOG.info('Indexing WARC: ' + warc_path)
        client = boto3.client('s3', config=self.boto_config)

        def head_succeeds(bucket, key):
            # Any ClientError from the HEAD request is treated as "absent".
            try:
                client.head_object(Bucket=bucket, Key=key)
            except botocore.client.ClientError:
                return False
            return True

        if self.options.skip_existing and head_succeeds(self.options.cdx_bucket,
                                                        cdx_path):
            LOG.info('Already Exists: ' + cdx_path)
            return

        if not head_succeeds(self.options.warc_bucket, warc_path):
            LOG.error('WARC not found: ' + warc_path)
            return

        with TemporaryFile(mode='w+b',
                           dir=self.options.s3_local_temp_dir) as local_warc:
            LOG.info('Fetching WARC: ' + warc_path)
            try:
                client.download_fileobj(self.options.warc_bucket,
                                        warc_path,
                                        local_warc)
            except botocore.client.ClientError as download_error:
                LOG.error('Failed to download {}: {}'.format(warc_path,
                                                             download_error))
                return

            local_warc.seek(0)
            LOG.info('Successfully fetched WARC: ' + warc_path)

            with TemporaryFile(mode='w+b',
                               dir=self.options.s3_local_temp_dir) as local_cdx:
                # Index into the temp file through a gzip wrapper; the wrapper
                # is closed before the temp file is rewound for upload.
                with GzipFile(fileobj=local_cdx, mode='w+b') as compressed:
                    write_cdx_index(compressed,
                                    local_warc,
                                    warc_path,
                                    **self.index_options)

                local_cdx.flush()
                local_cdx.seek(0)

                LOG.info('Uploading CDX: ' + cdx_path)
                try:
                    client.upload_fileobj(local_cdx,
                                          self.options.cdx_bucket,
                                          cdx_path)
                except botocore.client.ClientError as upload_error:
                    LOG.error('Failed to upload {}: {}'.format(cdx_path,
                                                               upload_error))
                    return
                LOG.info('Successfully uploaded CDX: ' + cdx_path)
Exemplo n.º 7
0
    def add_urls_to_index(self, stream, params, filename, length):
        """Index ``stream`` as CDXJ and add each index line to a redis sorted set.

        :param stream: readable WARC stream handed to ``write_cdx_index``
        :param params: mapping used to resolve the file name and the redis
            key template; its optional ``writer_cls`` entry is forwarded to
            the indexer
        :param filename: name of the WARC being indexed
        :param length: not used by this method
        :return: list of index lines (bytes) split on newlines; when the
            indexer writes nothing this is ``[b'']``, and empty entries are
            not sent to redis
        """
        index_buf = BytesIO()
        write_cdx_index(
            index_buf,
            stream,
            self._get_rel_or_base_name(filename, params),
            cdxj=True,
            append_post=True,
            writer_cls=params.get('writer_cls'),
        )

        redis_key = res_template(self.redis_key_template, params)
        entries = index_buf.getvalue().rstrip().split(b'\n')

        for entry in entries:
            if not entry:
                continue
            # Every entry is added with score 0.
            self.redis.zadd(redis_key, 0, entry)

        return entries
Exemplo n.º 8
0
def test_multipart_form():
    test_data = b'\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:073fac44-c383-4a2b-980d-76fec83bd20d>\r\n\
WARC-Date: 2020-11-19T19:54:34Z\r\n\
WARC-Target-URI: https://example.com/ajax/bz?foo=bar\r\n\
Content-Type: application/http;msgtype=response\r\n\
WARC-Payload-Digest: sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\r\n\
Content-Length: 48\r\n\
WARC-Block-Digest: sha1:XN45YTSBLG5PLJ4HA7DRDYGJBM5VW4UO\r\n\
\r\n\
Content-Type: text/html; charset="utf-8"\r\n\
\r\n\
ABCD\r\n\
\r\n\
\r\n\
\r\n\
WARC/1.0\r\n\
WARC-Type: request\r\n\
WARC-Record-ID: <urn:uuid:3084e79c-ae58-4bfd-8590-fcf2830fe896>\r\n\
WARC-Date: 2020-11-19T19:54:34Z\r\n\
WARC-Target-URI: https://example.com/ajax/bz?foo=bar\r\n\
WARC-Concurrent-To: <urn:uuid:073fac44-c383-4a2b-980d-76fec83bd20d>\r\n\
WARC-Block-Digest: sha1:LNYP3X3NWXQLUGDI745P4L4FK27XGP24\r\n\
Content-Type: application/http;msgtype=request\r\n\
Content-Length: 321\r\n\
\r\n\
POST /ajax/bz?foo=bar HTTP/1.1\r\n\
Content-Type: multipart/form-data; boundary=----WebKitFormBoundaryWUBf9liofZK0nuJd\r\n\
content-Length: 199\r\n\
\r\n\
------WebKitFormBoundaryWUBf9liofZK0nuJd\r\n\
Content-Disposition: form-data; name="q"\r\n\
\r\n\
[{"webSessionId":"pb2tr7:vx83uz:fdi8ta","user":"******"}]\r\n\
------WebKitFormBoundaryWUBf9liofZK0nuJd--\r\n\
\r\n\
'

    options = dict(include_all=True, append_post=True)
    buff = BytesIO()
    test_record = BytesIO(test_data)
    write_cdx_index(buff, test_record, 'test.warc.gz', **options)
    print(buff.getvalue())
    assert buff.getvalue() == b"""\
Exemplo n.º 9
0
def test_multipart_form_no_boundary():
    test_data = b'\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:3bc1606a-d517-487e-a6d5-bfeaebda2ec3>\r\n\
WARC-Date: 2020-11-19T14:02:52Z\r\n\
WARC-Target-URI: https://capi.connatix.com/core/story?v=77797\r\n\
WARC-IP-Address: 18.221.6.219\r\n\
Content-Type: application/http;msgtype=response\r\n\
WARC-Payload-Digest: sha1:SIGZ3RJW5J7DUKEZ4R7RSYUZNGLETIS5\r\n\
Content-Length: 41\r\n\
WARC-Block-Digest: sha1:JXKKZNALIPOW7J2FX5XUTGQZXKBSGZLU\r\n\
\r\n\
Content-Type: multipart/form-data\r\n\
\r\n\
ABCD\r\n\
\r\n\
\r\n\
\r\n\
WARC/1.0\r\n\
WARC-Type: request\r\n\
WARC-Record-ID: <urn:uuid:d5e7186f-5725-4ed1-b199-56fbdf4bd805>\r\n\
WARC-Date: 2020-11-19T14:02:52Z\r\n\
WARC-Target-URI: https://capi.connatix.com/core/story?v=77797\r\n\
WARC-Concurrent-To: <urn:uuid:3bc1606a-d517-487e-a6d5-bfeaebda2ec3>\r\n\
WARC-Block-Digest: sha1:QJ2YUIKEWDSCLK5A2DHGLQ7WWEKYMO3W\r\n\
Content-Type: application/http;msgtype=request\r\n\
Content-Length: 111\r\n\
\r\n\
POST /core/story?v=77797 HTTP/1.1\r\n\
Content-Length: 19\r\n\
Content-Type: multipart/form-data\r\n\
\r\n\
{"text": "default"}\r\n\
\r\n\
'

    options = dict(include_all=True, append_post=True)
    buff = BytesIO()
    test_record = BytesIO(test_data)
    write_cdx_index(buff, test_record, 'test.warc.gz', **options)
    assert buff.getvalue() == b"""\
Exemplo n.º 10
0
    def add_urls_to_index(self, stream, params, filename, length):
        """Write a CDXJ index for ``stream`` and push its lines into redis.

        The index is built in memory with ``append_post`` enabled and the
        ``writer_cls`` taken from ``params`` (``None`` if absent). Each
        non-empty line is added, with score 0, to the sorted set whose key is
        produced by ``res_template`` from ``self.redis_key_template``.

        :param stream: WARC stream to index
        :param params: parameters for name resolution and key templating
        :param filename: WARC file name, reduced via ``_get_rel_or_base_name``
        :param length: unused here
        :return: all newline-separated index lines as a list of bytes
            (may contain a single empty entry if no output was produced)
        """
        indexed_name = self._get_rel_or_base_name(filename, params)
        writer_cls = params.get('writer_cls')

        buf = BytesIO()
        write_cdx_index(buf, stream, indexed_name,
                        cdxj=True, append_post=True, writer_cls=writer_cls)

        sorted_set_key = res_template(self.redis_key_template, params)

        lines = buf.getvalue().rstrip().split(b'\n')

        # filter(None, ...) skips empty lines, as the original guard did.
        for line in filter(None, lines):
            self.redis.zadd(sorted_set_key, 0, line)

        return lines
Exemplo n.º 11
0
def test_invalid_decoding_uri_py2():
    test_data = b'\
WARC/1.0\r\n\
WARC-Type: resource\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\xc3\x83\xc2\xa9\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
Content-Type: text/plain\r\n\
Content-Length: 4\r\n\
\r\n\
ABCD\r\n\
\r\n'

    options = dict(include_all=True)

    buff = BytesIO()

    test_record = BytesIO(test_data)

    write_cdx_index(buff, test_record, 'test.warc.gz', **options)

    assert buff.getvalue() == b"""\
Exemplo n.º 12
0
def test_cdxj_middle_empty_records():
    """Empty gzip records around and between WARC copies add no index lines.

    Builds an in-memory stream laid out as
    ``empty, example2, empty, empty, example2`` and expects exactly two CDXJ
    lines from indexing it.
    """
    empty_gzip_record = b'\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00'

    with open(TEST_WARC_DIR + 'example2.warc.gz', 'rb') as fh:
        warc_bytes = fh.read()

    padded_warc = BytesIO(b''.join([
        empty_gzip_record,
        warc_bytes,
        empty_gzip_record,
        empty_gzip_record,
        warc_bytes,
    ]))

    index_out = BytesIO()
    write_cdx_index(index_out, padded_warc, 'empty.warc.gz', cdxj=True)

    lines = index_out.getvalue().rstrip().split(b'\n')

    assert len(lines) == 2, lines
Exemplo n.º 13
0
def test_no_index_metadata_mime_textanvl():
    test_data = b'\
WARC/0.18\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:1fd7789c-9cd5-47ea-b7ba-2a97dc06680b>\r\n\
WARC-Target-URI: http://example.com/xyz.pdf\r\n\
WARC-Date: 2014-04-01T05:20:11Z\r\n\
WARC-Payload-Digest: sha1:EDIYL6WNHDY62TPKUCPSEWMOAAGYTOAS\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: 4\r\n\
\r\n\
ABCD\r\n\
\r\n\
\r\n\
\r\n\
WARC/0.18\r\n\
WARC-Type: metadata\r\n\
WARC-Record-ID: <urn:uuid:0735267f-5749-4c02-b08b-955af5d76032>\r\n\
WARC-Target-URI: http://example.com/xyz.pdf\r\n\
WARC-Date: 2014-04-01T05:20:11Z\r\n\
WARC-Payload-Digest: sha1:EDIYL6WNHDY62TPKUCPSEWMOAAGYTOAS\r\n\
Content-Type: text/anvl\r\n\
Content-Length: 4\r\n\
\r\n\
ABCD\r\n\
\r\n\
'

    options = dict(include_all=True)

    buff = BytesIO()

    test_record = BytesIO(test_data)

    write_cdx_index(buff, test_record, 'test.warc.gz', **options)

    assert buff.getvalue() == b"""\
Exemplo n.º 14
0
    def test_record_multiple_writes_keep_open(self):
        """Record twice into collection FOO and check the WARC handle stays cached.

        After two recordings the test expects two ``FOO:cdxj`` entries in
        redis that equal a fresh index of the written WARC, and one entry in
        ``writer.fh_cache``. ``close_key`` must empty the cache; a further
        recording after ``close()`` must lead to two entries in ``FOO:warc``.
        ``_test_all_warcs`` is called with the expected count 1 and later 2
        (presumably the number of WARC files -- helper not visible here).
        """
        coll_key = to_path(self.root_dir + '/warcs/FOO/')
        rel_path = to_path(self.root_dir + '/warcs/')
        warc_path = to_path(self.root_dir +
                            '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')

        writer = MultiFileWARCWriter(
            warc_path, dedup_index=self._get_dedup_index(user=False))
        recorder_app = RecorderApp(self.upstream_url, writer)

        # First and second record: same host, different query.
        recordings = (
            ('/get?foo=bar', b'"foo": "bar"'),
            ('/get?boo=far', b'"boo": "far"'),
        )
        for path_and_query, expected_fragment in recordings:
            resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                         path_and_query,
                                         '&param.recorder.coll=FOO')
            assert b'HTTP/1.1 200 OK' in resp.body
            assert expected_fragment in resp.body

        self._test_all_warcs('/warcs/FOO/', 1)

        # Check two records in WARC
        r = FakeStrictRedis.from_url('redis://localhost/2')
        redis_lines = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(redis_lines) == 2

        files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
        fullname = coll_dir + files[0]

        # Re-index the WARC from disk, sorted, for comparison with redis.
        cdxout = BytesIO()
        with open(fullname, 'rb') as fh:
            write_cdx_index(cdxout, fh, os.path.relpath(fullname, rel_path),
                            cdxj=True, append_post=True, sort=True)

        from_redis = [CDXObject(line) for line in redis_lines]
        from_file = [CDXObject(line)
                     for line in cdxout.getvalue().strip().split(b'\n')]

        assert from_file == from_redis

        assert len(writer.fh_cache) == 1

        writer.close_key(coll_key)

        assert len(writer.fh_cache) == 0

        writer.close()

        # Record once more after the writer was closed.
        self._test_warc_write(recorder_app, 'httpbin.org',
                              '/get?boo=far',
                              '&param.recorder.coll=FOO')

        self._test_all_warcs('/warcs/FOO/', 2)

        assert len(r.hgetall('FOO:warc')) == 2

        writer.close()
        assert len(writer.fh_cache) == 0
Exemplo n.º 15
0
    def test_record_multiple_writes_keep_open(self):
        """Two recordings into FOO share one open WARC; closing releases it.

        Sequence checked:

        * two recordings succeed and echo their query arguments;
        * redis holds two ``FOO:cdxj`` entries in the ``org,httpbin`` range,
          equal to a sorted re-index of the WARC on disk;
        * ``writer.fh_cache`` has one entry, and none after ``close_key``;
        * recording again after ``close()`` yields two ``FOO:warc`` entries.

        ``_test_all_warcs`` is given the expected count (1, then 2); what it
        counts is presumably WARC files -- the helper is not visible here.
        """
        warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')
        rel_path = to_path(self.root_dir + '/warcs/')

        dedup_index = self._get_dedup_index(user=False)
        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        def record(path_and_query):
            # All recordings target httpbin.org in collection FOO.
            return self._test_warc_write(recorder_app, 'httpbin.org',
                                         path_and_query,
                                         '&param.recorder.coll=FOO')

        # First Record
        first = record('/get?foo=bar')
        assert b'HTTP/1.1 200 OK' in first.body
        assert b'"foo": "bar"' in first.body

        # Second Record
        second = record('/get?boo=far')
        assert b'HTTP/1.1 200 OK' in second.body
        assert b'"boo": "far"' in second.body

        self._test_all_warcs('/warcs/FOO/', 1)

        # Check two records in WARC
        r = FakeStrictRedis.from_url('redis://localhost/2')
        indexed = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
        assert len(indexed) == 2

        files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
        fullname = coll_dir + files[0]
        filename = os.path.relpath(fullname, rel_path)

        cdxout = BytesIO()
        with open(fullname, 'rb') as warc_fh:
            write_cdx_index(cdxout, warc_fh, filename,
                            cdxj=True,
                            append_post=True,
                            sort=True)

        expected = [CDXObject(raw) for raw in indexed]

        actual = []
        for raw in cdxout.getvalue().strip().split(b'\n'):
            actual.append(CDXObject(raw))

        assert actual == expected

        assert len(writer.fh_cache) == 1

        writer.close_key(to_path(self.root_dir + '/warcs/FOO/'))

        assert len(writer.fh_cache) == 0

        writer.close()

        # A recording made after close() is expected to add a second WARC.
        record('/get?boo=far')

        self._test_all_warcs('/warcs/FOO/', 2)

        warcs = r.hgetall('FOO:warc')
        assert len(warcs) == 2

        writer.close()
        assert len(writer.fh_cache) == 0