Exemplo n.º 1
0
def test_to_str_2():
    """Check that parsed request headers stringify back to the input text.

    The headers are parsed twice: once as-is and once with an extra
    trailing CRLF appended. Both results must stringify to ``req_headers``.
    """
    parser = StatusAndHeadersParser(['GET'])
    parsed = parser.parse(StringIO(req_headers))
    assert str(parsed) == req_headers

    # Same input followed by one more CRLF; the expected text is unchanged.
    parser = StatusAndHeadersParser(['GET'])
    parsed = parser.parse(StringIO(req_headers + '\r\n'))
    assert str(parsed) == req_headers
Exemplo n.º 2
0
    def __init__(self, verify_http=True, arc2warc=True):
        """Create the ARC, WARC and HTTP header parsers.

        :param verify_http: forwarded as the second argument to both
            HTTP parsers (``http_parser`` and ``http_req_parser``)
        :param arc2warc: when truthy, ``arc_parser`` is an
            ``ARC2WARCHeadersParser``; otherwise an ``ARCHeadersParser``
        """
        arc_parser_cls = ARC2WARCHeadersParser if arc2warc else ARCHeadersParser
        self.arc_parser = arc_parser_cls()

        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)

        # Both HTTP parsers receive the same verify_http value.
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES,
                                                  verify_http)
        self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS,
                                                      verify_http)
Exemplo n.º 3
0
    def test_agg_local_revisit(self):
        """Request a capture from the ``local`` source and verify the
        response headers, the WARC record headers and the payload."""
        resp = self.testapp.get(
            '/many/resource'
            '?url=http://www.example.com/&closest=20140127171251&sources=local'
        )

        assert resp.headers['Warcserver-Source-Coll'] == 'local:dupes.cdxj'

        # The response body starts with the WARC record headers.
        status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(
            BytesIO(resp.body))

        expected_warc_headers = (
            ('WARC-Target-URI', 'http://example.com'),
            ('WARC-Date', '2014-01-27T17:12:51Z'),
            ('WARC-Refers-To-Target-URI', 'http://example.com'),
            ('WARC-Refers-To-Date', '2014-01-27T17:12:00Z'),
        )
        for header_name, expected_value in expected_warc_headers:
            assert status_headers.get_header(header_name) == expected_value

        expected_link = MementoUtils.make_link('http://example.com',
                                               'original')
        assert resp.headers['Link'] == expected_link
        assert (resp.headers['Memento-Datetime'] ==
                'Mon, 27 Jan 2014 17:12:51 GMT')

        for marker in (b'HTTP/1.1 200 OK', b'<!doctype html>'):
            assert marker in resp.body

        assert 'ResErrors' not in resp.headers
Exemplo n.º 4
0
    def __init__(self, gzip=True, *args, **kwargs):
        """Initialise writer state.

        :param gzip: stored unchanged as ``self.gzip``
        :param kwargs: ``warc_version`` is read with ``self.WARC_VERSION``
            as the fallback when the key is absent; ``header_filter`` is
            read and is ``None`` when absent
        """
        self.gzip = gzip
        self.hostname = gethostname()

        # Parser is created with an empty list and verify switched off.
        self.parser = StatusAndHeadersParser([], verify=False)

        fallback_version = self.WARC_VERSION
        self.warc_version = kwargs.get('warc_version', fallback_version)
        self.header_filter = kwargs.get('header_filter', None)
Exemplo n.º 5
0
    def __init__(self, gzip=True, *args, **kwargs):
        """Initialise the base class, then the writer's own state.

        :param gzip: stored unchanged as ``self.gzip``
        :param kwargs: ``warc_version`` and ``header_filter`` are passed to
            the parent initialiser (``None`` for whichever is absent)
        """
        parent_kwargs = {
            'warc_version': kwargs.get('warc_version'),
            'header_filter': kwargs.get('header_filter'),
        }
        super(BaseWARCWriter, self).__init__(**parent_kwargs)

        self.gzip = gzip
        self.hostname = gethostname()

        # Parser is created with an empty list and verify switched off.
        self.parser = StatusAndHeadersParser([], verify=False)
Exemplo n.º 6
0
    def test_record_param_user_coll_revisit(self):
        """Record a URL into a user/coll WARC directory that already holds
        one WARC and verify a ``revisit`` record is written and indexed."""
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
        dedup_index = self._get_dedup_index()

        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        # One WARC is expected before the request is recorded.
        self._test_all_warcs('/warcs/USER/COLL/', 1)

        resp = self._test_warc_write(
            recorder_app, 'httpbin.org', '/user-agent',
            '&param.recorder.user=USER&param.recorder.coll=COLL')

        assert '"user-agent": "{0}"'.format(UA) in resp.text
        #assert b'HTTP/1.1 200 OK' in resp.body
        #assert b'"foo": "bar"' in resp.body

        # ...and two afterwards.
        self._test_all_warcs('/warcs/USER/COLL/', 2)

        # Test Redis CDX
        redis = FakeStrictRedis.from_url('redis://localhost/2')

        entries = redis.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/',
                                    '(org,httpbin,')
        assert len(entries) == 2

        # Use whichever of the two index lines is the revisit entry.
        revisit_line = entries[0] if b'warc/revisit' in entries[0] else entries[1]
        cdx = CDXObject(revisit_line)

        expected_fields = (
            ('urlkey', 'org,httpbin)/user-agent'),
            ('mime', 'warc/revisit'),
            ('offset', '0'),
        )
        for field, expected in expected_fields:
            assert cdx[field] == expected

        assert cdx['filename'].startswith(to_path('USER/COLL/'))
        assert cdx['filename'].endswith('.warc.gz')

        fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

        warcs = redis.hgetall('USER:COLL:warc')
        assert len(warcs) == 2
        filename_key = cdx['filename'].encode('utf-8')
        assert warcs[filename_key] == fullwarc.encode('utf-8')

        with open(fullwarc, 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            # Test refers-to headers
            status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
            get_header = status_headers.get_header
            recorded_url = 'http://httpbin.org/user-agent'

            assert get_header('WARC-Type') == 'revisit'
            assert get_header('WARC-Target-URI') == recorded_url
            assert get_header('WARC-Date') != ''
            assert get_header('WARC-Refers-To-Target-URI') == recorded_url
            assert get_header('WARC-Refers-To-Date') != ''
Exemplo n.º 7
0
def test_to_str_with_remove():
    """Check that a header removed via ``remove_header`` is absent from
    the string form of the parsed request headers."""
    parsed = StatusAndHeadersParser(['GET']).parse(StringIO(req_headers))
    parsed.remove_header('Foo')

    expected = (
        "GET / HTTP/1.0\r\n"
        "Content-Length: 0\r\n"
    )

    assert str(parsed) == expected
Exemplo n.º 8
0
 def _check_uri_date(self, resp, uri, dt):
     """Parse the WARC headers from ``resp.body`` and check URI and date.

     The body is wrapped in a ``ChunkedDataReader`` before it is handed to
     the ``WARC/1.0`` header parser.

     :param resp: response object exposing the raw bytes as ``resp.body``
     :param uri: expected value of the ``WARC-Target-URI`` header
     :param dt: the literal ``True`` to only require a non-empty
         ``WARC-Date`` header; any other value is compared for equality
         with the ``WARC-Date`` header
     """
     buff = ChunkedDataReader(BytesIO(resp.body))
     status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff)
     assert status_headers.get_header('WARC-Target-URI') == uri

     # Identity check: only the True singleton is the "any date" sentinel.
     # An equality test would also accept 1 or 1.0 and skip the comparison.
     if dt is True:
         assert status_headers.get_header('WARC-Date') != ''
     else:
         assert status_headers.get_header('WARC-Date') == dt
Exemplo n.º 9
0
def test_to_str_1():
    """Check the string form of the parsed ``status_headers_1`` fixture
    against the expected status line and three header lines."""
    parser = StatusAndHeadersParser(['HTTP/1.0'])
    parsed = parser.parse(StringIO(status_headers_1))
    actual = str(parsed)

    expected = (
        "HTTP/1.0 200 OK\r\n"
        "Content-Type: ABC\r\n"
        "Some: Value\r\n"
        "Multi-Line: Value1    Also This\r\n"
    )
    assert actual == expected
Exemplo n.º 10
0
    def __init__(self, paths, cdx_source):
        """Build the resolvers, record loader and header parser.

        :param paths: stored as ``self.paths`` and passed to
            ``self.make_resolvers``
        :param cdx_source: stored unchanged as ``self.cdx_source``
        """
        self.paths = paths
        self.resolvers = self.make_resolvers(self.paths)

        # The loader is created with no_record_parse enabled.
        loader = ResolvingLoader(self.resolvers, no_record_parse=True)
        self.resolve_loader = loader

        # Parser is created with an empty list and verify switched off.
        self.headers_parser = StatusAndHeadersParser([], verify=False)

        self.cdx_source = cdx_source
Exemplo n.º 11
0
def test_to_str_exclude():
    """Check that ``to_str`` and ``to_bytes`` omit a header when the
    filter callback returns ``None`` for it."""
    def drop_multi_line(header):
        # header[0] is the header name; the Multi-Line header maps to None.
        return None if header[0].lower() == 'multi-line' else header

    parser = StatusAndHeadersParser(['HTTP/1.0'])
    sah = parser.parse(StringIO(status_headers_1))
    text = sah.to_str(drop_multi_line)

    expected = (
        "HTTP/1.0 200 OK\r\n"
        "Content-Type: ABC\r\n"
        "Some: Value\r\n"
    )
    assert text == expected

    # The bytes form is the latin-1 encoded text plus one extra CRLF.
    expected_bytes = expected.encode('latin-1') + b'\r\n'
    assert sah.to_bytes(drop_multi_line) == expected_bytes
Exemplo n.º 12
0
    def __init__(self, env):
        """Store ``env`` and parse status/headers from ``env['wsgi.input']``.

        :param env: mapping stored as ``self.env``; its ``'wsgi.input'``
            entry is passed to the parser
        """
        self.env = env

        # Parser is created with an empty list and verify switched off.
        self.status_headers = StatusAndHeadersParser(
            [], verify=False).parse(self.env['wsgi.input'])
Exemplo n.º 13
0
def test_status_one_word():
    """With verify off, the one-word input ``'A'`` stringifies to
    ``'A'`` followed by CRLF."""
    parser = StatusAndHeadersParser(['GET'], verify=False)
    parsed = parser.parse(StringIO('A'))
    assert str(parsed) == 'A\r\n'
Exemplo n.º 14
0
def test_status_empty():
    """Parsing an empty stream must raise ``EOFError``."""
    with pytest.raises(EOFError):
        parser = StatusAndHeadersParser([], verify=False)
        parser.parse(StringIO(''))