def test_anon_download_coll(self):
    """Download the anonymous ``temp`` collection and check its CDXJ index.

    The downloaded body is de-chunked, indexed with ``write_cdx_index``
    (all record types included, CDXJ output) and the leading index entries
    are compared against the expected url / mime pairs.
    """
    res = self._get_anon('/temp/$download')

    assert res.headers['Content-Disposition'].startswith("attachment; filename*=UTF-8''temp-")

    warcin = self._get_dechunked(res.body)

    cdxout = BytesIO()
    write_cdx_index(cdxout, warcin, 'temp.warc.gz', include_all=True, cdxj=True)

    # Distinct loop name so the list being built is not shadowed.
    cdx = [CDXObject(line) for line in cdxout.getvalue().rstrip().split(b'\n')]
    assert len(cdx) == 12

    # (url, mime) expected for the first index entries, in index order.
    expected = [
        ('http://httpbin.org/get?food=bar', 'application/json'),  # response
        ('http://httpbin.org/get?food=bar', '-'),                 # request
        ('http://httpbin.org/get?bood=far', 'application/json'),  # response
        ('http://httpbin.org/get?bood=far', '-'),                 # request
        ('http://httpbin.org/get?boof=mar', '-'),                 # request
    ]

    # Compare (not assign) so the index contents are actually verified.
    for entry, (url, mime) in zip(cdx, expected):
        assert entry['url'] == url
        assert entry['mime'] == mime
def test_anon_download_coll(self):
    """Download the anonymous ``temp`` collection and check its CDXJ index.

    The downloaded body is de-chunked, indexed with ``write_cdx_index``
    (all record types included, CDXJ output) and the leading index entries
    are compared against the expected url / mime pairs.
    """
    res = self._get_anon('/temp/$download')

    assert res.headers['Content-Disposition'].startswith(
        "attachment; filename*=UTF-8''temp-")

    warcin = self._get_dechunked(res.body)

    cdxout = BytesIO()
    write_cdx_index(cdxout, warcin, 'temp.warc.gz', include_all=True, cdxj=True)

    # Distinct loop name so the list being built is not shadowed.
    cdx = [
        CDXObject(line)
        for line in cdxout.getvalue().rstrip().split(b'\n')
    ]
    assert len(cdx) == 10

    # (url, mime) expected for the first index entries, in index order.
    expected = [
        ('http://httpbin.org/get?food=bar', 'application/json'),  # response
        ('http://httpbin.org/get?food=bar', '-'),                 # request
        ('http://httpbin.org/get?bood=far', 'application/json'),  # response
        ('http://httpbin.org/get?bood=far', '-'),                 # request
    ]

    # Compare (not assign) so the index contents are actually verified.
    for entry, (url, mime) in zip(cdx, expected):
        assert entry['url'] == url
        assert entry['mime'] == mime
def cdx_index(warc, **options):
    """Index a test WARC and return the raw index bytes.

    ``warc`` is a file name appended to ``TEST_WARC_DIR``; ``options`` are
    forwarded unchanged to ``write_cdx_index``.
    """
    warc_path = TEST_WARC_DIR + warc
    index_out = BytesIO()

    with open(warc_path, 'rb') as warc_stream:
        write_cdx_index(index_out, warc_stream, warc, **options)

    return index_out.getvalue()
def test_cdxj_empty():
    """Indexing an empty input stream in CDXJ mode must write nothing."""
    index_out = BytesIO()
    empty_input = BytesIO()

    write_cdx_index(index_out, empty_input, 'empty.warc.gz', cdxj=True)

    assert index_out.getvalue() == b''
def _load_and_index(self, warc_path):
    """Download one WARC from S3, index it and upload the gzipped CDX.

    Steps, each logged through ``LOG``:

    1. When ``options.skip_existing`` is set, return early if the CDX key
       already exists in ``options.cdx_bucket``.
    2. Return early (logging an error) if the WARC key cannot be HEADed in
       ``options.warc_bucket`` or cannot be downloaded.
    3. Index the downloaded WARC into a gzip-compressed temporary file
       using ``write_cdx_index`` with ``self.index_options``.
    4. Upload the temporary CDX to ``options.cdx_bucket``.

    Every ``botocore`` ``ClientError`` is handled here; nothing is returned.
    """
    cdx_path = self._conv_warc_to_cdx_path(warc_path)

    LOG.info('Indexing WARC: ' + warc_path)

    s3 = boto3.client('s3', config=self.boto_config)
    opts = self.options

    if opts.skip_existing:
        try:
            s3.head_object(Bucket=opts.cdx_bucket, Key=cdx_path)
        except botocore.client.ClientError:
            # ok, not found
            # NOTE(review): any ClientError (not only a 404) lands here and
            # is treated as "CDX missing" -- confirm that is intended.
            pass
        else:
            LOG.info('Already Exists: ' + cdx_path)
            return

    try:
        s3.head_object(Bucket=opts.warc_bucket, Key=warc_path)
    except botocore.client.ClientError:
        LOG.error('WARC not found: ' + warc_path)
        return

    with TemporaryFile(mode='w+b', dir=opts.s3_local_temp_dir) as warc_tmp:
        LOG.info('Fetching WARC: ' + warc_path)
        try:
            s3.download_fileobj(opts.warc_bucket, warc_path, warc_tmp)
        except botocore.client.ClientError as err:
            LOG.error('Failed to download {}: {}'.format(warc_path, err))
            return

        # Rewind so the indexer reads the WARC from its first byte.
        warc_tmp.seek(0)
        LOG.info('Successfully fetched WARC: ' + warc_path)

        with TemporaryFile(mode='w+b', dir=opts.s3_local_temp_dir) as cdx_tmp:
            # Index to temp; leaving the GzipFile context finalizes the
            # gzip stream before the underlying file is uploaded.
            with GzipFile(fileobj=cdx_tmp, mode='w+b') as cdx_gz:
                write_cdx_index(cdx_gz, warc_tmp, warc_path, **self.index_options)

            # Upload temp
            cdx_tmp.flush()
            cdx_tmp.seek(0)
            LOG.info('Uploading CDX: ' + cdx_path)
            try:
                s3.upload_fileobj(cdx_tmp, opts.cdx_bucket, cdx_path)
            except botocore.client.ClientError as err:
                LOG.error('Failed to upload {}: {}'.format(cdx_path, err))
                return

            LOG.info('Successfully uploaded CDX: ' + cdx_path)
def add_urls_to_index(self, stream, params, filename, length):
    """Index ``stream`` as CDXJ and add each line to a redis sorted set.

    The sorted-set key is produced by ``res_template`` from
    ``self.redis_key_template`` and ``params``; every non-empty index line
    is added with score 0. ``length`` is accepted but not used here.

    Returns the list of index lines exactly as split from the indexer
    output (this may include an empty entry when nothing was indexed).
    """
    index_name = self._get_rel_or_base_name(filename, params)

    index_out = BytesIO()
    write_cdx_index(
        index_out,
        stream,
        index_name,
        cdxj=True,
        append_post=True,
        writer_cls=params.get('writer_cls'),
    )

    redis_key = res_template(self.redis_key_template, params)

    cdx_lines = index_out.getvalue().rstrip().split(b'\n')

    # Skip empty lines; only real index entries go into redis.
    for line in filter(None, cdx_lines):
        self.redis.zadd(redis_key, 0, line)

    return cdx_lines
def test_multipart_form(): test_data = b'\ WARC/1.0\r\n\ WARC-Type: response\r\n\ WARC-Record-ID: <urn:uuid:073fac44-c383-4a2b-980d-76fec83bd20d>\r\n\ WARC-Date: 2020-11-19T19:54:34Z\r\n\ WARC-Target-URI: https://example.com/ajax/bz?foo=bar\r\n\ Content-Type: application/http;msgtype=response\r\n\ WARC-Payload-Digest: sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\r\n\ Content-Length: 48\r\n\ WARC-Block-Digest: sha1:XN45YTSBLG5PLJ4HA7DRDYGJBM5VW4UO\r\n\ \r\n\ Content-Type: text/html; charset="utf-8"\r\n\ \r\n\ ABCD\r\n\ \r\n\ \r\n\ \r\n\ WARC/1.0\r\n\ WARC-Type: request\r\n\ WARC-Record-ID: <urn:uuid:3084e79c-ae58-4bfd-8590-fcf2830fe896>\r\n\ WARC-Date: 2020-11-19T19:54:34Z\r\n\ WARC-Target-URI: https://example.com/ajax/bz?foo=bar\r\n\ WARC-Concurrent-To: <urn:uuid:073fac44-c383-4a2b-980d-76fec83bd20d>\r\n\ WARC-Block-Digest: sha1:LNYP3X3NWXQLUGDI745P4L4FK27XGP24\r\n\ Content-Type: application/http;msgtype=request\r\n\ Content-Length: 321\r\n\ \r\n\ POST /ajax/bz?foo=bar HTTP/1.1\r\n\ Content-Type: multipart/form-data; boundary=----WebKitFormBoundaryWUBf9liofZK0nuJd\r\n\ content-Length: 199\r\n\ \r\n\ ------WebKitFormBoundaryWUBf9liofZK0nuJd\r\n\ Content-Disposition: form-data; name="q"\r\n\ \r\n\ [{"webSessionId":"pb2tr7:vx83uz:fdi8ta","user":"******"}]\r\n\ ------WebKitFormBoundaryWUBf9liofZK0nuJd--\r\n\ \r\n\ ' options = dict(include_all=True, append_post=True) buff = BytesIO() test_record = BytesIO(test_data) write_cdx_index(buff, test_record, 'test.warc.gz', **options) print(buff.getvalue()) assert buff.getvalue() == b"""\
def test_multipart_form_no_boundary(): test_data = b'\ WARC/1.0\r\n\ WARC-Type: response\r\n\ WARC-Record-ID: <urn:uuid:3bc1606a-d517-487e-a6d5-bfeaebda2ec3>\r\n\ WARC-Date: 2020-11-19T14:02:52Z\r\n\ WARC-Target-URI: https://capi.connatix.com/core/story?v=77797\r\n\ WARC-IP-Address: 18.221.6.219\r\n\ Content-Type: application/http;msgtype=response\r\n\ WARC-Payload-Digest: sha1:SIGZ3RJW5J7DUKEZ4R7RSYUZNGLETIS5\r\n\ Content-Length: 41\r\n\ WARC-Block-Digest: sha1:JXKKZNALIPOW7J2FX5XUTGQZXKBSGZLU\r\n\ \r\n\ Content-Type: multipart/form-data\r\n\ \r\n\ ABCD\r\n\ \r\n\ \r\n\ \r\n\ WARC/1.0\r\n\ WARC-Type: request\r\n\ WARC-Record-ID: <urn:uuid:d5e7186f-5725-4ed1-b199-56fbdf4bd805>\r\n\ WARC-Date: 2020-11-19T14:02:52Z\r\n\ WARC-Target-URI: https://capi.connatix.com/core/story?v=77797\r\n\ WARC-Concurrent-To: <urn:uuid:3bc1606a-d517-487e-a6d5-bfeaebda2ec3>\r\n\ WARC-Block-Digest: sha1:QJ2YUIKEWDSCLK5A2DHGLQ7WWEKYMO3W\r\n\ Content-Type: application/http;msgtype=request\r\n\ Content-Length: 111\r\n\ \r\n\ POST /core/story?v=77797 HTTP/1.1\r\n\ Content-Length: 19\r\n\ Content-Type: multipart/form-data\r\n\ \r\n\ {"text": "default"}\r\n\ \r\n\ ' options = dict(include_all=True, append_post=True) buff = BytesIO() test_record = BytesIO(test_data) write_cdx_index(buff, test_record, 'test.warc.gz', **options) assert buff.getvalue() == b"""\
def test_invalid_decoding_uri_py2(): test_data = b'\ WARC/1.0\r\n\ WARC-Type: resource\r\n\ WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\ WARC-Target-URI: http://example.com/\xc3\x83\xc2\xa9\r\n\ WARC-Date: 2000-01-01T00:00:00Z\r\n\ Content-Type: text/plain\r\n\ Content-Length: 4\r\n\ \r\n\ ABCD\r\n\ \r\n' options = dict(include_all=True) buff = BytesIO() test_record = BytesIO(test_data) write_cdx_index(buff, test_record, 'test.warc.gz', **options) assert buff.getvalue() == b"""\
def test_cdxj_middle_empty_records():
    """Empty gzip members around and between WARC data are tolerated.

    Builds a stream of: empty member, ``example2.warc.gz``, two empty
    members, ``example2.warc.gz`` again -- and expects the CDXJ index to
    contain exactly two lines.
    """
    empty_member = b'\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00'

    with open(TEST_WARC_DIR + 'example2.warc.gz', 'rb') as source:
        warc_bytes = source.read()

    parts = [empty_member, warc_bytes, empty_member, empty_member, warc_bytes]
    # A BytesIO built from initial bytes starts at position 0.
    combined = BytesIO(b''.join(parts))

    index_out = BytesIO()
    write_cdx_index(index_out, combined, 'empty.warc.gz', cdxj=True)

    lines = index_out.getvalue().rstrip().split(b'\n')

    assert len(lines) == 2, lines
def test_no_index_metadata_mime_textanvl(): test_data = b'\ WARC/0.18\r\n\ WARC-Type: response\r\n\ WARC-Record-ID: <urn:uuid:1fd7789c-9cd5-47ea-b7ba-2a97dc06680b>\r\n\ WARC-Target-URI: http://example.com/xyz.pdf\r\n\ WARC-Date: 2014-04-01T05:20:11Z\r\n\ WARC-Payload-Digest: sha1:EDIYL6WNHDY62TPKUCPSEWMOAAGYTOAS\r\n\ Content-Type: application/http; msgtype=response\r\n\ Content-Length: 4\r\n\ \r\n\ ABCD\r\n\ \r\n\ \r\n\ \r\n\ WARC/0.18\r\n\ WARC-Type: metadata\r\n\ WARC-Record-ID: <urn:uuid:0735267f-5749-4c02-b08b-955af5d76032>\r\n\ WARC-Target-URI: http://example.com/xyz.pdf\r\n\ WARC-Date: 2014-04-01T05:20:11Z\r\n\ WARC-Payload-Digest: sha1:EDIYL6WNHDY62TPKUCPSEWMOAAGYTOAS\r\n\ Content-Type: text/anvl\r\n\ Content-Length: 4\r\n\ \r\n\ ABCD\r\n\ \r\n\ ' options = dict(include_all=True) buff = BytesIO() test_record = BytesIO(test_data) write_cdx_index(buff, test_record, 'test.warc.gz', **options) assert buff.getvalue() == b"""\
def test_record_multiple_writes_keep_open(self):
    """Record twice into one open WARC, then again after closing it.

    Two requests are recorded through a ``MultiFileWARCWriter`` for
    collection ``FOO``. The redis CDXJ index must then match a fresh,
    sorted index of the single WARC on disk, and the writer must hold one
    cached file handle. After the handle is closed, a further recording is
    expected to produce a second WARC.
    """
    warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')
    rel_path = to_path(self.root_dir + '/warcs/')

    dedup_index = self._get_dedup_index(user=False)

    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # Extra query fragment selecting the target collection. It must start
    # with a literal '&param' -- an HTML-entity-mangled form ('\xb6m...')
    # would not be recognised as the collection parameter.
    coll_param = '&param.recorder.coll=FOO'

    # First Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?foo=bar', coll_param)

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    # Second Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?boo=far', coll_param)

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"boo": "far"' in resp.body

    self._test_all_warcs('/warcs/FOO/', 1)

    # Check two records in WARC
    r = FakeStrictRedis.from_url('redis://localhost/2')
    res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
    fullname = coll_dir + files[0]

    # Re-index the WARC on disk, sorted, to compare with the redis index.
    cdxout = BytesIO()
    with open(fullname, 'rb') as fh:
        filename = os.path.relpath(fullname, rel_path)
        write_cdx_index(cdxout, fh, filename,
                        cdxj=True, append_post=True, sort=True)

    res = [CDXObject(x) for x in res]

    cdxres = cdxout.getvalue().strip()
    cdxres = cdxres.split(b'\n')
    cdxres = [CDXObject(x) for x in cdxres]

    assert cdxres == res

    # The WARC is still held open by the writer until explicitly closed.
    assert len(writer.fh_cache) == 1

    writer.close_key(to_path(self.root_dir + '/warcs/FOO/'))

    assert len(writer.fh_cache) == 0

    writer.close()

    # Recording after the close is expected to start a second WARC file.
    self._test_warc_write(recorder_app, 'httpbin.org',
                          '/get?boo=far', coll_param)

    self._test_all_warcs('/warcs/FOO/', 2)

    warcs = r.hgetall('FOO:warc')
    assert len(warcs) == 2

    writer.close()
    assert len(writer.fh_cache) == 0