예제 #1
0
    def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
                           status_headers, is_timegate, is_proxy, coll=None):
        """Append the Memento response headers to ``status_headers.headers``.

        A ``Link`` header is always appended. ``Memento-Datetime`` is added
        when a memento datetime is given or can be derived from
        ``memento_ts``, and ``Vary: accept-datetime`` is added for timegate
        responses.

        :param url: the original url
        :param full_prefix: prefix used to build the memento url; also passed
            to ``_get_timegate_timemap``
        :param memento_dt: memento datetime value for the header, may be empty
        :param memento_ts: memento timestamp; converted with
            ``timestamp_to_http_date`` when ``memento_dt`` is empty
        :param status_headers: object whose ``headers`` list is extended
        :param is_timegate: truthy to add the ``Vary`` header
        :param is_proxy: truthy to use ``url`` itself as the memento url and
            to omit the original/timegate/timemap links
        :param coll: passed through to ``MementoUtils.make_memento_link``
        """
        out_headers = status_headers.headers

        # Fall back to the timestamp when no explicit datetime was supplied.
        if not memento_dt and memento_ts:
            memento_dt = timestamp_to_http_date(memento_ts)

        memento_url = None
        if memento_dt:
            out_headers.append(('Memento-Datetime', memento_dt))
            if is_proxy:
                memento_url = url
            else:
                memento_url = full_prefix + memento_ts + self.replay_mod + ('/' + url)

        timegate_url, timemap_url = self._get_timegate_timemap(url, full_prefix)

        link = []
        if not is_proxy:
            # Proxy responses only carry the memento link (if any).
            for target, rel in ((url, 'original'),
                                (timegate_url, 'timegate'),
                                (timemap_url, 'timemap')):
                link.append(MementoUtils.make_link(target, rel))

        if memento_dt:
            link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt, coll))

        out_headers.append(('Link', ', '.join(link)))

        if is_timegate:
            out_headers.append(('Vary', 'accept-datetime'))
예제 #2
0
    def test_agg_post_resolve_postreq(self):
        req_data = """\
POST /post HTTP/1.1
content-length: 16
accept-encoding: gzip, deflate
accept: */*
host: httpbin.org
content-type: application/x-www-form-urlencoded

foo=bar&test=abc"""

        resp = self.testapp.post('/posttest/resource/postreq?url=http://httpbin.org/post', req_data)

        assert resp.headers['Warcserver-Source-Coll'] == 'post'

        self._check_uri_date(resp, 'http://httpbin.org/post', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body
        assert b'"test": "abc"' in resp.body
        assert b'"url": "http://httpbin.org/post"' in resp.body

        assert 'ResErrors' not in resp.headers
예제 #3
0
    def test_agg_post_resolve_postreq(self):
        req_data = """\
POST /post HTTP/1.1
content-length: 16
accept-encoding: gzip, deflate
accept: */*
host: httpbin.org
content-type: application/x-www-form-urlencoded

foo=bar&test=abc"""

        resp = self.testapp.post('/posttest/resource/postreq?url=http://httpbin.org/post', req_data)

        assert resp.headers['Warcserver-Source-Coll'] == 'post'

        self._check_uri_date(resp, 'http://httpbin.org/post', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body
        assert b'"test": "abc"' in resp.body
        assert b'"url": "http://httpbin.org/post"' in resp.body

        assert 'ResErrors' not in resp.headers
예제 #4
0
    def _test_warc_write(self,
                         recorder_app,
                         host,
                         path,
                         other_params='',
                         link_url=''):
        """POST a request for ``http://<host><path>`` through ``recorder_app`` and return the response.

        Calls ``recorder_app._write_one()`` if the write queue is non-empty,
        then checks the source collection, ``Link`` and ``Memento-Datetime``
        headers. ``link_url`` overrides the url expected in the ``Link``
        header; by default the unquoted request url is expected.
        """
        url = 'http://' + host + path
        req_url = '/live/resource/postreq?url=' + url + other_params

        testapp = webtest.TestApp(recorder_app)
        payload = general_req_data.format(host=host, path=path).encode('utf-8')
        resp = testapp.post(req_url, payload)

        queue = recorder_app.write_queue
        if not queue.empty():
            recorder_app._write_one()

        headers = resp.headers
        assert headers['Warcserver-Source-Coll'] == 'live'

        expected_url = link_url or unquote(url)

        assert headers['Link'] == MementoUtils.make_link(expected_url, 'original')
        assert headers['Memento-Datetime'] != ''

        return resp
예제 #5
0
    def test_agg_local_revisit(self):
        """Fetch a record restricted to the local source and check its WARC revisit headers."""
        resp = self.testapp.get(
            '/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local'
        )
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'local:dupes.cdxj'

        # Parse the WARC record headers from the start of the response body.
        warc_headers = StatusAndHeadersParser(['WARC/1.0']).parse(BytesIO(resp.body))
        expected_warc_headers = (
            ('WARC-Target-URI', 'http://example.com'),
            ('WARC-Date', '2014-01-27T17:12:51Z'),
            ('WARC-Refers-To-Target-URI', 'http://example.com'),
            ('WARC-Refers-To-Date', '2014-01-27T17:12:00Z'),
        )
        for name, expected in expected_warc_headers:
            assert warc_headers.get_header(name) == expected

        assert headers['Link'] == MementoUtils.make_link('http://example.com', 'original')
        assert headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'

        body = resp.body
        assert b'HTTP/1.1 200 OK' in body
        assert b'<!doctype html>' in body

        assert 'ResErrors' not in headers
예제 #6
0
    def test_agg_live_postreq(self):
        req_data = """\
GET /get?foo=bar HTTP/1.1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36
Host: httpbin.org
"""

        resp = self.testapp.post(
            '/many/resource/postreq?url=http://httpbin.org/get?foo=bar&closest=now',
            req_data)

        assert resp.headers['Warcserver-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True)

        assert resp.headers['Link'] == MementoUtils.make_link(
            'http://httpbin.org/get?foo=bar', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        #assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://httpbin.org/get?foo=bar',)"}
        assert "NotFoundException('http://webenact.rhizome.org/vvork/" in json.loads(
            resp.headers['ResErrors'])['rhiz']
예제 #7
0
    def test_live_video_loader_post(self):
        pytest.importorskip('youtube_dl')
        req_data = """\
GET /v/BfBgWtAIbRc HTTP/1.1
accept-encoding: gzip, deflate
accept: */*
host: www.youtube.com\
"""

        params = {
            'url': 'http://www.youtube.com/v/BfBgWtAIbRc',
            'content_type': 'application/vnd.youtube-dl_formats+json'
        }

        resp = self.testapp.post(
            '/live/resource/postreq?&' + urlencode(params), req_data)

        assert resp.headers['Warcserver-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'metadata://www.youtube.com/v/BfBgWtAIbRc',
                             True)

        assert resp.headers['Link'] == MementoUtils.make_link(
            'metadata://www.youtube.com/v/BfBgWtAIbRc', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'WARC-Type: metadata' in resp.body
        assert b'Content-Type: application/vnd.youtube-dl_formats+json' in resp.body
예제 #8
0
    def test_agg_select_local_postreq(self):
        req_data = """\
GET / HTTP/1.1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36
Host: iana.org
"""

        resp = self.testapp.post(
            '/many/resource/postreq?url=http://iana.org/&closest=20140126200624',
            req_data)

        assert resp.headers['Warcserver-Source-Coll'] == 'local:iana.cdxj'

        self._check_uri_date(resp, 'http://www.iana.org/',
                             '2014-01-26T20:06:24Z')

        assert resp.headers['Link'] == MementoUtils.make_link(
            'http://www.iana.org/', 'original')
        assert resp.headers[
            'Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'

        assert json.loads(resp.headers['ResErrors']) == {
            "rhiz":
            "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"
        }
예제 #9
0
    def test_agg_select_live(self):
        """Request ``closest=now`` and expect the 'live' source to be selected without errors."""
        target_url = 'http://vvork.com/'

        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=now')
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'live'

        self._check_uri_date(resp, target_url, True)

        expected_link = MementoUtils.make_link(target_url, 'original')
        assert headers['Link'] == expected_link
        assert headers['Memento-Datetime'] != ''

        assert 'ResErrors' not in headers
예제 #10
0
    def test_url_agnost(self):
        """Resolve a url-agnostic revisit through WARC paths registered in (fake) redis.

        Registers the revisit and original WARC files under the
        ``test:foo:warc`` hash, requests ``http://example.com/`` with
        ``param.arg=foo`` and checks the status, ``Link``, source collection
        and ``Memento-Datetime`` headers.
        """
        redis = FakeStrictRedis.from_url('redis://localhost/2')
        for warc_name in ('example-url-agnostic-revisit.warc.gz',
                          'example-url-agnostic-orig.warc.gz'):
            redis.hset('test:foo:warc', warc_name, TEST_WARC_PATH + warc_name)

        resp = self.testapp.get('/urlagnost/resource?url=http://example.com/&param.arg=foo')

        assert resp.status_int == 200
        # The expected url carries a userinfo component ("test@"); the literal
        # had been mangled into an e-mail obfuscation placeholder, which can
        # never match a real Link header.
        assert resp.headers['Link'] == MementoUtils.make_link('http://test@example.com/', 'original')
        assert resp.headers['Warcserver-Source-Coll'] == 'url-agnost'
        assert resp.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT'
예제 #11
0
    def test_url_agnost(self):
        """Resolve a url-agnostic revisit through WARC paths registered in (fake) redis.

        Registers the revisit and original WARC files under the
        ``test:foo:warc`` hash, requests ``http://example.com/`` with
        ``param.arg=foo`` and checks the status, ``Link``, source collection
        and ``Memento-Datetime`` headers.
        """
        redis = FakeStrictRedis.from_url('redis://localhost/2')
        for warc_name in ('example-url-agnostic-revisit.warc.gz',
                          'example-url-agnostic-orig.warc.gz'):
            redis.hset('test:foo:warc', warc_name, TEST_WARC_PATH + warc_name)

        resp = self.testapp.get('/urlagnost/resource?url=http://example.com/&param.arg=foo')

        assert resp.status_int == 200
        # The expected url carries a userinfo component ("test@"); the literal
        # had been mangled into an e-mail obfuscation placeholder, which can
        # never match a real Link header.
        assert resp.headers['Link'] == MementoUtils.make_link('http://test@example.com/', 'original')
        assert resp.headers['Warcserver-Source-Coll'] == 'url-agnost'
        assert resp.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT'
예제 #12
0
    def test_agg_select_live(self):
        """Request ``closest=now`` and expect the 'live' source to be selected without errors."""
        target_url = 'http://vvork.com/'

        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=now')
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'live'

        self._check_uri_date(resp, target_url, True)

        expected_link = MementoUtils.make_link(target_url, 'original')
        assert headers['Link'] == expected_link
        assert headers['Memento-Datetime'] != ''

        assert 'ResErrors' not in headers
예제 #13
0
    def test_agg_select_local(self):
        """Request a 2014 capture of iana.org and expect the local cdxj source to be selected."""
        resolved_url = 'http://www.iana.org/'

        resp = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624')
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'local:iana.cdxj'

        self._check_uri_date(resp, resolved_url, '2014-01-26T20:06:24Z')

        assert headers['Link'] == MementoUtils.make_link(resolved_url, 'original')
        assert headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'

        # The 'rhiz' source is expected to report a not-found error.
        res_errors = json.loads(headers['ResErrors'])
        assert res_errors == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
예제 #14
0
    def test_agg_select_local(self):
        """Request a 2014 capture of iana.org and expect the local cdxj source to be selected."""
        resolved_url = 'http://www.iana.org/'

        resp = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624')
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'local:iana.cdxj'

        self._check_uri_date(resp, resolved_url, '2014-01-26T20:06:24Z')

        assert headers['Link'] == MementoUtils.make_link(resolved_url, 'original')
        assert headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'

        # The 'rhiz' source is expected to report a not-found error.
        res_errors = json.loads(headers['ResErrors'])
        assert res_errors == {"rhiz": "NotFoundException('https://webenact.rhizome.org/vvork/http://iana.org/',)"}
예제 #15
0
    def send_redirect(self, new_path, url_parts, urlrewriter):
        """Build a 307 redirect to the url formed by swapping ``new_path`` into ``url_parts``.

        :param new_path: path that replaces the path component of ``url_parts``
        :param url_parts: 5-item sequence (scheme, netloc, path, query, fragment)
        :param urlrewriter: object whose ``rewrite(url)`` gives the redirect target
        :return: the redirect response; when ``self.enable_memento`` is set it
            also carries a ``Link`` header pointing at the un-rewritten url
        """
        scheme, netloc, _replaced_path, query, frag = url_parts
        url = urlunsplit((scheme, netloc, new_path, query, frag))

        target = urlrewriter.rewrite(url)
        resp = WbResponse.redir_response(target, '307 Temporary Redirect')

        if not self.enable_memento:
            return resp

        resp.status_headers['Link'] = MementoUtils.make_link(url, 'original')
        return resp
예제 #16
0
    def test_agg_seq_fallback_1(self):
        """Request a url through the fallback route and expect the 'live' source to answer."""
        target_url = 'http://httpbin.org/status/200'

        resp = self.testapp.get('/fallback/resource?url=http://httpbin.org/status/200')
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'live'

        self._check_uri_date(resp, target_url, True)

        expected_link = MementoUtils.make_link(target_url, 'original')
        assert headers['Link'] == expected_link

        assert b'HTTP/1.1 200 OK' in resp.body

        assert 'ResErrors' not in headers
예제 #17
0
    def test_agg_seq_fallback_1(self):
        """Request a url through the fallback route and expect the 'live' source to answer."""
        target_url = 'http://httpbin.org/status/200'

        resp = self.testapp.get('/fallback/resource?url=http://httpbin.org/status/200')
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'live'

        self._check_uri_date(resp, target_url, True)

        expected_link = MementoUtils.make_link(target_url, 'original')
        assert headers['Link'] == expected_link

        assert b'HTTP/1.1 200 OK' in resp.body

        assert 'ResErrors' not in headers
예제 #18
0
    def test_agg_seq_fallback_2(self):
        """Request www.example.com through the fallback route and expect the 'example' source."""
        resolved_url = 'http://example.com/'

        resp = self.testapp.get('/fallback/resource?url=http://www.example.com/')
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'example'

        self._check_uri_date(resp, resolved_url, '2016-02-25T04:23:29Z')

        expected_link = MementoUtils.make_link(resolved_url, 'original')
        assert headers['Link'] == expected_link
        assert headers['Memento-Datetime'] == 'Thu, 25 Feb 2016 04:23:29 GMT'

        assert b'HTTP/1.1 200 OK' in resp.body

        assert 'ResErrors' not in headers
예제 #19
0
    def test_agg_select_mem_1(self):
        """Request a capture closest to 2014-10-01 and expect the 'rhiz' source to be selected."""
        resolved_url = 'http://www.vvork.com/'

        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20141001')
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'rhiz'

        self._check_uri_date(resp, resolved_url, '2014-10-06T18:43:57Z')

        assert b'HTTP/1.1 200 OK' in resp.body

        expected_link = MementoUtils.make_link(resolved_url, 'original')
        assert headers['Link'] == expected_link
        assert headers['Memento-Datetime'] == 'Mon, 06 Oct 2014 18:43:57 GMT'

        assert 'ResErrors' not in headers
예제 #20
0
    def test_agg_select_mem_1(self):
        """Request a capture closest to 2014-10-01 and expect the 'rhiz' source to be selected."""
        resolved_url = 'http://www.vvork.com/'

        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20141001')
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'rhiz'

        self._check_uri_date(resp, resolved_url, '2014-10-06T18:43:57Z')

        assert b'HTTP/1.1 200 OK' in resp.body

        expected_link = MementoUtils.make_link(resolved_url, 'original')
        assert headers['Link'] == expected_link
        assert headers['Memento-Datetime'] == 'Mon, 06 Oct 2014 18:43:57 GMT'

        assert 'ResErrors' not in headers
예제 #21
0
    def test_agg_seq_fallback_2(self):
        """Request www.example.com through the fallback route and expect the 'example' source."""
        resolved_url = 'http://example.com/'

        resp = self.testapp.get('/fallback/resource?url=http://www.example.com/')
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'example'

        self._check_uri_date(resp, resolved_url, '2016-02-25T04:23:29Z')

        expected_link = MementoUtils.make_link(resolved_url, 'original')
        assert headers['Link'] == expected_link
        assert headers['Memento-Datetime'] == 'Thu, 25 Feb 2016 04:23:29 GMT'

        assert b'HTTP/1.1 200 OK' in resp.body

        assert 'ResErrors' not in headers
예제 #22
0
    def test_agg_select_mem_2(self):
        """Request a capture closest to 2015-12-31 and expect the 'ia' source to be selected."""
        target_url = 'http://vvork.com/'

        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20151231')
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'ia'

        self._check_uri_date(resp, target_url, '2016-01-10T13:48:55Z')

        assert b'HTTP/1.1 200 OK' in resp.body

        expected_link = MementoUtils.make_link(target_url, 'original')
        assert headers['Link'] == expected_link
        assert headers['Memento-Datetime'] == 'Sun, 10 Jan 2016 13:48:55 GMT'

        assert 'ResErrors' not in headers
예제 #23
0
    def test_agg_select_mem_2(self):
        """Request a capture closest to 2015-12-31 and expect the 'ia' source to be selected."""
        target_url = 'http://vvork.com/'

        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20151231')
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'ia'

        self._check_uri_date(resp, target_url, '2016-01-10T13:48:55Z')

        assert b'HTTP/1.1 200 OK' in resp.body

        expected_link = MementoUtils.make_link(target_url, 'original')
        assert headers['Link'] == expected_link
        assert headers['Memento-Datetime'] == 'Sun, 10 Jan 2016 13:48:55 GMT'

        assert 'ResErrors' not in headers
예제 #24
0
    def test_live_post_resource(self):
        """POST form data to the live resource route and expect it back in the response body."""
        target_url = 'http://httpbin.org/post'
        form = OrderedDict([('foo', 'bar')])

        resp = self.testapp.post('/live/resource?url=http://httpbin.org/post', form)
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'live'

        self._check_uri_date(resp, target_url, True)

        assert headers['Link'] == MementoUtils.make_link(target_url, 'original')
        assert headers['Memento-Datetime'] != ''

        body = resp.body
        assert b'HTTP/1.1 200 OK' in body
        assert b'"foo": "bar"' in body

        assert 'ResErrors' not in headers
예제 #25
0
    def test_live_resource(self):
        """GET a url through the live resource route, sending an extra ``foo`` request header."""
        target_url = 'http://httpbin.org/get?foo=bar'
        request_headers = {'foo': 'bar'}

        resp = self.testapp.get('/live/resource?url=http://httpbin.org/get?foo=bar',
                                headers=request_headers)
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'live'

        self._check_uri_date(resp, target_url, True)

        assert headers['Link'] == MementoUtils.make_link(target_url, 'original')
        assert headers['Memento-Datetime'] != ''

        body = resp.body
        assert b'HTTP/1.1 200 OK' in body
        assert b'"foo": "bar"' in body

        assert 'ResErrors' not in headers
예제 #26
0
    def test_live_post_resource(self):
        """POST form data to the live resource route and expect it back in the response body."""
        target_url = 'http://httpbin.org/post'
        form = OrderedDict([('foo', 'bar')])

        resp = self.testapp.post('/live/resource?url=http://httpbin.org/post', form)
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'live'

        self._check_uri_date(resp, target_url, True)

        assert headers['Link'] == MementoUtils.make_link(target_url, 'original')
        assert headers['Memento-Datetime'] != ''

        body = resp.body
        assert b'HTTP/1.1 200 OK' in body
        assert b'"foo": "bar"' in body

        assert 'ResErrors' not in headers
예제 #27
0
    def test_live_resource(self):
        """GET a url through the live resource route, sending an extra ``foo`` request header."""
        target_url = 'http://httpbin.org/get?foo=bar'
        request_headers = {'foo': 'bar'}

        resp = self.testapp.get('/live/resource?url=http://httpbin.org/get?foo=bar',
                                headers=request_headers)
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'live'

        self._check_uri_date(resp, target_url, True)

        assert headers['Link'] == MementoUtils.make_link(target_url, 'original')
        assert headers['Memento-Datetime'] != ''

        body = resp.body
        assert b'HTTP/1.1 200 OK' in body
        assert b'"foo": "bar"' in body

        assert 'ResErrors' not in headers
예제 #28
0
    def test_live_video_loader(self):
        """GET video info from the live route and expect a WARC metadata record.

        Skipped when ``youtube_dl`` cannot be imported.
        """
        pytest.importorskip('youtube_dl')

        metadata_url = 'metadata://www.youtube.com/v/BfBgWtAIbRc'
        params = {
            'url': 'http://www.youtube.com/v/BfBgWtAIbRc',
            'content_type': 'application/vnd.youtube-dl_formats+json',
        }

        resp = self.testapp.get('/live/resource', params=params)
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'live'

        self._check_uri_date(resp, metadata_url, True)

        assert headers['Link'] == MementoUtils.make_link(metadata_url, 'original')
        assert headers['Memento-Datetime'] != ''

        body = resp.body
        assert b'WARC-Type: metadata' in body
        assert b'Content-Type: application/vnd.youtube-dl_formats+json' in body
예제 #29
0
    def test_live_video_loader(self):
        """GET video info from the live route and expect a WARC metadata record.

        Skipped when ``youtube_dl`` cannot be imported.
        """
        pytest.importorskip('youtube_dl')

        metadata_url = 'metadata://www.youtube.com/v/BfBgWtAIbRc'
        params = {
            'url': 'http://www.youtube.com/v/BfBgWtAIbRc',
            'content_type': 'application/vnd.youtube-dl_formats+json',
        }

        resp = self.testapp.get('/live/resource', params=params)
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'live'

        self._check_uri_date(resp, metadata_url, True)

        assert headers['Link'] == MementoUtils.make_link(metadata_url, 'original')
        assert headers['Memento-Datetime'] != ''

        body = resp.body
        assert b'WARC-Type: metadata' in body
        assert b'Content-Type: application/vnd.youtube-dl_formats+json' in body
예제 #30
0
    def test_agg_post_resolve_fallback(self):
        """POST form data through the fallback route and expect the 'post' source to answer."""
        target_url = 'http://httpbin.org/post'
        req_data = OrderedDict([('foo', 'bar'), ('test', 'abc')])

        resp = self.testapp.post('/fallback/resource?url=http://httpbin.org/post', req_data)
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'post'

        self._check_uri_date(resp, target_url, True)

        assert headers['Link'] == MementoUtils.make_link(target_url, 'original')

        # The posted form fields and url must appear in the response body.
        body = resp.body
        for fragment in (b'HTTP/1.1 200 OK',
                         b'"foo": "bar"',
                         b'"test": "abc"',
                         b'"url": "http://httpbin.org/post"'):
            assert fragment in body

        assert 'ResErrors' not in headers
예제 #31
0
    def test_agg_post_resolve_fallback(self):
        """POST form data through the fallback route and expect the 'post' source to answer."""
        target_url = 'http://httpbin.org/post'
        req_data = OrderedDict([('foo', 'bar'), ('test', 'abc')])

        resp = self.testapp.post('/fallback/resource?url=http://httpbin.org/post', req_data)
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'post'

        self._check_uri_date(resp, target_url, True)

        assert headers['Link'] == MementoUtils.make_link(target_url, 'original')

        # The posted form fields and url must appear in the response body.
        body = resp.body
        for fragment in (b'HTTP/1.1 200 OK',
                         b'"foo": "bar"',
                         b'"test": "abc"',
                         b'"url": "http://httpbin.org/post"'):
            assert fragment in body

        assert 'ResErrors' not in headers
예제 #32
0
    def test_agg_select_local_postreq(self):
        req_data = """\
GET / HTTP/1.1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36
Host: iana.org
"""

        resp = self.testapp.post('/many/resource/postreq?url=http://iana.org/&closest=20140126200624', req_data)

        assert resp.headers['Warcserver-Source-Coll'] == 'local:iana.cdxj'

        self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z')

        assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
        assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'

        assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
예제 #33
0
    def _test_warc_write(self, recorder_app, host, path, other_params='', link_url=''):
        """POST a request for ``http://<host><path>`` through ``recorder_app`` and return the response.

        Calls ``recorder_app._write_one()`` if the write queue is non-empty,
        then checks the source collection, ``Link`` and ``Memento-Datetime``
        headers. ``link_url`` overrides the url expected in the ``Link``
        header; by default the unquoted request url is expected.
        """
        url = 'http://' + host + path
        req_url = '/live/resource/postreq?url=' + url + other_params

        testapp = webtest.TestApp(recorder_app)
        payload = general_req_data.format(host=host, path=path).encode('utf-8')
        resp = testapp.post(req_url, payload)

        queue = recorder_app.write_queue
        if not queue.empty():
            recorder_app._write_one()

        headers = resp.headers
        assert headers['Warcserver-Source-Coll'] == 'live'

        expected_url = link_url or unquote(url)

        assert headers['Link'] == MementoUtils.make_link(expected_url, 'original')
        assert headers['Memento-Datetime'] != ''

        return resp
예제 #34
0
    def test_agg_local_revisit(self):
        """Fetch a record restricted to the local source and check its WARC revisit headers."""
        resp = self.testapp.get('/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local')
        headers = resp.headers

        assert headers['Warcserver-Source-Coll'] == 'local:dupes.cdxj'

        # Parse the WARC record headers from the start of the response body.
        warc_headers = StatusAndHeadersParser(['WARC/1.0']).parse(BytesIO(resp.body))
        expected_warc_headers = (
            ('WARC-Target-URI', 'http://example.com'),
            ('WARC-Date', '2014-01-27T17:12:51Z'),
            ('WARC-Refers-To-Target-URI', 'http://example.com'),
            ('WARC-Refers-To-Date', '2014-01-27T17:12:00Z'),
        )
        for name, expected in expected_warc_headers:
            assert warc_headers.get_header(name) == expected

        assert headers['Link'] == MementoUtils.make_link('http://example.com', 'original')
        assert headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'

        body = resp.body
        assert b'HTTP/1.1 200 OK' in body
        assert b'<!doctype html>' in body

        assert 'ResErrors' not in headers
예제 #35
0
    def make_timemap(self, wb_url, res, full_prefix, output):
        """Build the timemap text response from an upstream response ``res``.

        :param wb_url: url object; its ``type`` is set to ``wb_url.QUERY`` and
            its ``url`` / ``mod`` are used to build the timegate and timemap
            urls
        :param res: upstream response providing ``headers``, ``text``,
            ``status_code`` and ``reason``
        :param full_prefix: prefix passed to ``_get_timegate_timemap``
        :param output: requested output format; for ``'link'`` a successful
            body is wrapped with ``MementoUtils.wrap_timemap_header``
        :return: ``WbResponse.text_response`` carrying the (possibly wrapped)
            text, the upstream content type and the computed status
        """
        wb_url.type = wb_url.QUERY

        content_type = res.headers.get('Content-Type')
        text = res.text

        if not text:
            # An empty upstream body is reported as not found.
            status = '404 Not Found'

        elif res.status_code:
            status = str(res.status_code) + ' ' + res.reason

            if res.status_code == 200 and output == 'link':
                timegate, timemap = self._get_timegate_timemap(
                    wb_url.url, full_prefix, wb_url.mod)

                text = MementoUtils.wrap_timemap_header(
                    wb_url.url, timegate, timemap, text)

        else:
            # A body without a usable status code previously left ``status``
            # unbound and raised UnboundLocalError at the return below.
            # NOTE(review): '200 OK' is an assumed fallback — confirm.
            status = '200 OK'

        return WbResponse.text_response(text,
                                        content_type=content_type,
                                        status=status)
예제 #36
0
    def test_agg_live_postreq(self):
        req_data = """\
GET /get?foo=bar HTTP/1.1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36
Host: httpbin.org
"""

        resp = self.testapp.post('/many/resource/postreq?url=http://httpbin.org/get?foo=bar&closest=now', req_data)

        assert resp.headers['Warcserver-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        #assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://httpbin.org/get?foo=bar',)"}
        assert "NotFoundException('http://webenact.rhizome.org/vvork/" in json.loads(resp.headers['ResErrors'])['rhiz']
예제 #37
0
    def links_to_cdxobject(self, link_header, def_name):
        """Yield one ``CDXObject`` per memento found in ``link_header``.

        The header is parsed with ``MementoUtils.parse_links``. Every yielded
        object shares the url key canonicalized from the original url, and
        carries the memento's timestamp, relation, memento url and the load
        url produced by ``self._get_replay_url``.

        :param link_header: Link header value to parse
        :param def_name: passed through to ``MementoUtils.parse_links``
        """
        parsed = MementoUtils.parse_links(link_header, def_name)

        original = parsed['original']['url']
        urlkey = canonicalize(original)

        for memento in parsed['mementos']:
            timestamp = http_date_to_timestamp(memento['datetime'])

            cdx = CDXObject()
            cdx['urlkey'] = urlkey
            cdx['timestamp'] = timestamp
            cdx['url'] = original
            cdx['mem_rel'] = memento.get('rel', '')
            cdx['memento_url'] = memento['url']

            # The replay url is built from the timestamp as stored on the cdx.
            cdx['load_url'] = self._get_replay_url(cdx['timestamp'], original)
            yield cdx
예제 #38
0
    def test_live_video_loader_post(self):
        pytest.importorskip('youtube_dl')
        req_data = """\
GET /v/BfBgWtAIbRc HTTP/1.1
accept-encoding: gzip, deflate
accept: */*
host: www.youtube.com\
"""

        params = {'url': 'http://www.youtube.com/v/BfBgWtAIbRc',
                  'content_type': 'application/vnd.youtube-dl_formats+json'
                 }

        resp = self.testapp.post('/live/resource/postreq?&' + urlencode(params), req_data)

        assert resp.headers['Warcserver-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'metadata://www.youtube.com/v/BfBgWtAIbRc', True)

        assert resp.headers['Link'] == MementoUtils.make_link('metadata://www.youtube.com/v/BfBgWtAIbRc', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'WARC-Type: metadata' in resp.body
        assert b'Content-Type: application/vnd.youtube-dl_formats+json' in resp.body
예제 #39
0
파일: handlers.py 프로젝트: solversa/pywb
def to_link(cdx_iter, fields, params):
    """Format cdx entries as a link-format timemap.

    :param cdx_iter: cdx entries, passed to ``MementoUtils.make_timemap``
    :param fields: accepted but not used by this function
    :param params: passed to ``MementoUtils.make_timemap``
    :return: ``(content_type, timemap)`` tuple, where the content type is
             always ``application/link-format``
    """
    timemap = MementoUtils.make_timemap(cdx_iter, params)
    return 'application/link-format', timemap
예제 #40
0
    def __call__(self, cdx, params):
        """Load the resource for ``cdx`` and build response headers and body.

        :param cdx: cdx entry to load; ``cdx['recorder_skip']`` is set to
                    ``'1'`` when ``params`` has a truthy ``recorder_skip``
        :param params: request params; ``compress`` and ``recorder_skip``
                       are read here, and the whole mapping is passed to
                       ``load_resource``
        :return: ``(None, None)`` when ``load_resource`` returns nothing,
                 otherwise ``(headers_dict, body_iterator)``
        """
        loaded = self.load_resource(cdx, params)
        if not loaded:
            return None, None

        use_gzip = params.get('compress') == 'gzip'

        warc_headers, other_headers, stream = loaded

        source = self._get_source_id(cdx)

        out_headers = {
            'Warcserver-Type': 'warc',
            'Content-Type': 'application/warc-record',
        }

        if params.get('recorder_skip'):
            out_headers['Recorder-Skip'] = '1'
            # set before the cdx is serialized into the header below
            cdx['recorder_skip'] = '1'

        out_headers['Warcserver-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
        out_headers['Warcserver-Source-Coll'] = to_native_str(source)

        if not warc_headers:
            # no WARC headers: copy selected headers from other_headers
            # (when present) and stream the body as-is
            if other_headers:
                copied = ['Link', 'Memento-Datetime']
                if not use_gzip:
                    copied.append('Content-Length')

                for name in copied:
                    out_headers[name] = other_headers.get(name)

            return out_headers, StreamIter(stream, closer=call_release_conn)

        target_uri = warc_headers.get_header('WARC-Target-URI')
        out_headers['WARC-Target-URI'] = target_uri
        out_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

        warc_date = warc_headers.get_header('WARC-Date')
        out_headers['Memento-Datetime'] = datetime_to_http_date(
            iso_date_to_datetime(warc_date))

        warc_headers_buff = warc_headers.to_bytes()

        # content length is only computed for uncompressed output
        if use_gzip:
            lenset = False
        else:
            lenset = self._set_content_len(
                warc_headers.get_header('Content-Length'),
                out_headers,
                len(warc_headers_buff))

        streamiter = StreamIter(stream,
                                header1=warc_headers_buff,
                                header2=other_headers,
                                closer=call_release_conn)

        if use_gzip:
            streamiter = compress_gzip_iter(streamiter)
            out_headers['Content-Encoding'] = 'gzip'

        #if not lenset:
        #    out_headers['Transfer-Encoding'] = 'chunked'
        #    streamiter = chunk_encode_iter(streamiter)

        return out_headers, streamiter
예제 #41
0
    def render_content(self, wb_url, kwargs, environ):
        """Fetch the capture for ``wb_url`` upstream and build the replay response.

        :param str wb_url: the requested url string; any ``#`` is
            percent-encoded and the result is parsed into a ``WbUrl``
        :param dict kwargs: per-request arguments, passed to the various
            helpers; ``kwargs.get('coll', '')`` is read for the head insert
        :param dict environ: the request environ; several keys are read
            and some (``wsgi.url_scheme``, ``pywb.host_prefix``,
            ``pywb_proxy_magic``) are written
        :return: the custom response (via ``format_response``), a redirect
            response, a JSON response for history-page requests, or a
            ``WbResponse`` wrapping the rewritten status/headers and body
        :raises UpstreamException: if the upstream response status is >= 400
        """
        # percent-encode any '#' before parsing the string into a WbUrl
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)

        # scheme override: X-Forwarded-Proto header, else self.force_scheme
        proto = environ.get('HTTP_X_FORWARDED_PROTO', self.force_scheme)

        if proto:
            environ['wsgi.url_scheme'] = proto

        # a history-page request replaces the target url and is always
        # treated as ajax (so no head insert is created further below)
        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
        if history_page:
            wb_url.url = history_page
            is_ajax = True
        else:
            is_ajax = self.is_ajax(environ)

        is_timegate = self._check_accept_dt(wb_url, environ)

        host_prefix = self.get_host_prefix(environ)
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix
        environ['pywb.host_prefix'] = host_prefix
        pywb_static_prefix = host_prefix + environ.get(
            'pywb.app_prefix', '') + environ.get('pywb.static_prefix',
                                                 '/static/')
        # proxy mode is detected by the presence of this environ key
        is_proxy = ('wsgiprox.proxy_host' in environ)

        response = self.handle_custom_response(environ, wb_url, full_prefix,
                                               host_prefix, kwargs)

        # a custom response short-circuits the replay path entirely
        if response:
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix,
                                      pywb_static_prefix=pywb_static_prefix)

            framed_replay = self.framed_replay

        url_parts = urlsplit(wb_url.url)
        # url without a path: hand off to send_redirect with '/' as the path
        if not url_parts.path:
            return self.send_redirect('/', url_parts, urlrewriter)

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        # choose the content rewriter
        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(
            inputreq, wb_url)

        # cookie tracking is only applied when a tracker is configured
        # and a cookie key is available for this request
        setcookie_headers = None
        cookie_key = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            if cookie_key:
                res = self.cookie_tracker.get_cookie_headers(
                    wb_url.url, urlrewriter, cookie_key,
                    environ.get('HTTP_COOKIE', ''))
                inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        if r.status_code >= 400:
            # upstream error: read the body on a best-effort basis, always
            # close the raw stream, then raise with the body as details
            error = None
            try:
                error = r.raw.read()
            except Exception:
                pass
            finally:
                no_except_close(r.raw)

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            raise UpstreamException(r.status_code,
                                    url=wb_url.url,
                                    details=details)

        # the cdx entry for the loaded capture is passed back in a header
        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        cdx_url_parts = urlsplit(cdx['url'])

        if cdx_url_parts.path.endswith(
                '/') and not url_parts.path.endswith('/'):
            # add trailing slash
            new_path = url_parts.path + '/'

            # the upstream body is not used when redirecting
            no_except_close(r.raw)

            return self.send_redirect(new_path, url_parts, urlrewriter)

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        # cdx['urlkey'] = urlkey
        # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        # cdx['url'] = target_uri

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redir to exact, redir if url or ts are different
        if self.redirect_to_exact:
            if (set_content_loc or (wb_url.timestamp != cdx.get('timestamp')
                                    and not cdx.get('is_live'))):

                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect')
                if self.enable_memento:
                    # non-proxy timegate redirects get the full set of
                    # memento links; otherwise only the 'original' link
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri, full_prefix,
                                                memento_dt, cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate, is_proxy)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(
                            target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs, record)

        # when _add_range reports truthy, the modifier is switched to 'id_'
        # (presumably unrewritten replay for range responses -- confirm)
        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        if is_ajax:
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.create_insert_func(
                wb_url,
                full_prefix,
                host_prefix,
                top_url,
                environ,
                framed_replay,
                coll=kwargs.get('coll', ''),
                replay_mod=self.replay_mod,
                config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker and cookie_key:
            # skip add cookie if service worker is not 200
            # it seems cookie headers from service workers are not applied, so don't update in cache
            if wb_url.mod == 'sw_':
                cookie_key = None

            cookie_rewriter = self.cookie_tracker.get_rewriter(
                urlrewriter, cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter,
                            head_insert_func, cdx, environ)

        status_headers, gen, is_rw = result

        if history_page:
            # title fallback order: extracted from the content, then the
            # X-Wombat-History-Title header, then the history page url
            title = DefaultRewriter._extract_title(gen)
            if not title:
                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

            if not title:
                title = history_page

            self._add_history_page(cdx, kwargs, title)
            return WbResponse.json_response({'title': title})

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # a statusline without any space gets ' None' appended
        # (presumably a bare status code with no reason phrase -- confirm)
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'], full_prefix, memento_dt,
                                    cdx['timestamp'], status_headers,
                                    is_timegate, is_proxy,
                                    cdx.get('source-coll'))

            set_content_loc = True

        # Content-Location is only added when not redirecting to exact urls
        if set_content_loc and not self.redirect_to_exact:
            status_headers.headers.append(
                ('Content-Location',
                 urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                         url=cdx['url'])))
        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        return response
예제 #42
0
    def __call__(self, cdx, params):
        """Load the resource for ``cdx`` and return its headers and body iterator.

        :param cdx: cdx entry to load; gets ``recorder_skip`` set to ``'1'``
                    when ``params`` carries a truthy ``recorder_skip``
        :param params: request params (``compress`` and ``recorder_skip``
                       are read here; also passed to ``load_resource``)
        :return: ``(None, None)`` if nothing could be loaded, otherwise a
                 ``(headers_dict, body_iterator)`` tuple
        """
        resource = self.load_resource(cdx, params)
        if not resource:
            return None, None

        gzip_requested = (params.get('compress') == 'gzip')

        warc_headers, other_headers, stream = resource

        source_id = self._get_source_id(cdx)

        out_headers = {'Warcserver-Type': 'warc',
                       'Content-Type': 'application/warc-record'}

        if params.get('recorder_skip'):
            out_headers['Recorder-Skip'] = '1'
            # must happen before cdx.to_cdxj() is called below
            cdx['recorder_skip'] = '1'

        out_headers['Warcserver-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
        out_headers['Warcserver-Source-Coll'] = to_native_str(source_id)

        if not warc_headers:
            # without WARC headers, only pass selected headers through
            if other_headers:
                out_headers['Link'] = other_headers.get('Link')
                out_headers['Memento-Datetime'] = other_headers.get(
                    'Memento-Datetime')

                if not gzip_requested:
                    out_headers['Content-Length'] = other_headers.get(
                        'Content-Length')

            body = StreamIter(stream, closer=call_release_conn)
            return out_headers, body

        target_uri = warc_headers.get_header('WARC-Target-URI')
        out_headers['WARC-Target-URI'] = target_uri
        out_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

        record_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
        out_headers['Memento-Datetime'] = datetime_to_http_date(record_dt)

        warc_headers_buff = warc_headers.to_bytes()

        # a content length is only set when the output is not gzipped
        lenset = False
        if not gzip_requested:
            record_len = warc_headers.get_header('Content-Length')
            lenset = self._set_content_len(record_len,
                                           out_headers,
                                           len(warc_headers_buff))

        streamiter = StreamIter(stream,
                                header1=warc_headers_buff,
                                header2=other_headers,
                                closer=call_release_conn)

        if gzip_requested:
            streamiter = compress_gzip_iter(streamiter)
            out_headers['Content-Encoding'] = 'gzip'

        #if not lenset:
        #    out_headers['Transfer-Encoding'] = 'chunked'
        #    streamiter = chunk_encode_iter(streamiter)

        return out_headers, streamiter
예제 #43
0
    def _add_memento_links(self,
                           url,
                           full_prefix,
                           memento_dt,
                           memento_ts,
                           status_headers,
                           is_timegate,
                           is_proxy,
                           coll=None,
                           pref_applied=None,
                           mod=None,
                           is_memento=True):
        """Append the memento-related headers to a StatusAndHeaders instance.

        A ``Link`` header is always appended. ``Memento-Datetime``,
        ``Preference-Applied`` and ``Vary`` are appended depending on the
        arguments described below.

        :param str url: The URI-R being rewritten
        :param str full_prefix: The replay prefix
        :param str|None memento_dt: The memento datetime for the URI-R;
            when empty it is derived from ``memento_ts`` (if that is set)
        :param str memento_ts: The memento timestamp
        :param warcio.StatusAndHeaders status_headers: headers to append to
        :param bool is_timegate: If true, ``accept-datetime`` is included
            in the ``Vary`` header
        :param bool is_proxy: If true, the memento url is ``url`` itself and
            the original/timegate/timemap links are left out
        :param str|None coll: The collection the URI-R is from, passed to
            the memento link
        :param str|None pref_applied: If set, emitted as the
            ``Preference-Applied`` header and ``Prefer`` is added to ``Vary``
        :param str|None mod: The rewrite modifier; ``self.replay_mod`` is
            used for the memento url when this is empty
        :param bool is_memento: If true (and a datetime is available), the
            ``Memento-Datetime`` header is appended
        :rtype: None
        """
        headers = status_headers.headers

        replay_mod = mod or self.replay_mod

        # fall back to the timestamp when no http datetime was supplied
        if memento_ts and not memento_dt:
            memento_dt = timestamp_to_http_date(memento_ts)

        memento_url = None
        if memento_dt:
            if is_memento:
                headers.append(('Memento-Datetime', memento_dt))

            if is_proxy:
                memento_url = url
            else:
                memento_url = (full_prefix + memento_ts + replay_mod
                               + '/' + url)

        timegate_url, timemap_url = self._get_timegate_timemap(
            url, full_prefix, mod)

        links = []
        if not is_proxy:
            for target, rel in ((url, 'original'),
                                (timegate_url, 'timegate'),
                                (timemap_url, 'timemap')):
                links.append(MementoUtils.make_link(target, rel))

        if memento_dt:
            memento_link = MementoUtils.make_memento_link(
                memento_url, 'memento', memento_dt, coll)
            links.append(memento_link)

        headers.append(('Link', ', '.join(links)))

        # collect the Vary values; the header is only sent when non-empty
        vary_values = []
        if is_timegate:
            vary_values.append('accept-datetime')

        if pref_applied:
            vary_values.append('Prefer')
            headers.append(('Preference-Applied', pref_applied))

        if vary_values:
            headers.append(('Vary', ', '.join(vary_values)))
예제 #44
0
    def render_content(self, wb_url, kwargs, environ):
        """Fetch the capture for ``wb_url`` upstream and build the replay response.

        :param str wb_url: the requested url string; any ``#`` is
            percent-encoded and the result is parsed into a ``WbUrl``
        :param dict kwargs: per-request arguments, passed to the helpers
        :param dict environ: the request environ; ``pywb_proxy_magic`` is
            written in proxy mode
        :return: the custom response (via ``format_response``), a
            ``307 Temporary Redirect`` response, or a ``WbResponse``
            wrapping the rewritten status/headers and body
        :raises UpstreamException: if the upstream response status is >= 400
        """
        # percent-encode any '#' before parsing the string into a WbUrl
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)
        is_timegate = self._check_accept_dt(wb_url, environ)

        host_prefix = self.get_host_prefix(environ)
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        # proxy mode is detected by the presence of this environ key
        is_proxy = ('wsgiprox.proxy_host' in environ)

        response = self.handle_custom_response(environ, wb_url,
                                               full_prefix, host_prefix,
                                               kwargs)

        # a custom response short-circuits the replay path entirely
        if response:
            return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix)

            framed_replay = self.framed_replay

        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            # url has no path: redirect to the same url with '/' as the path
            scheme, netloc, path, query, frag = url_parts
            path = '/'
            url = urlunsplit((scheme, netloc, path, query, frag))
            resp = WbResponse.redir_response(urlrewriter.rewrite(url),
                                             '307 Temporary Redirect')

            if self.enable_memento:
                resp.status_headers['Link'] = MementoUtils.make_link(url, 'original')

            return resp

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        # choose the content rewriter
        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(inputreq, wb_url)

        # cookie_key is only bound when a cookie tracker is configured; it
        # is only read again below under the same condition
        setcookie_headers = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
            inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        if r.status_code >= 400:
            error = None
            try:
                error = r.raw.read()
            except Exception:
                # best effort: the body is only used as error details
                pass
            finally:
                # always close the upstream stream, even when the read
                # failed, so the connection is not leaked
                try:
                    r.raw.close()
                except Exception:
                    pass

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            raise UpstreamException(r.status_code, url=wb_url.url, details=details)

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        # the cdx entry for the loaded capture is passed back in a header
        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        #cdx['urlkey'] = urlkey
        #cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        #cdx['url'] = target_uri

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redir to exact, redir if url or ts are different
        if self.redirect_to_exact:
            if (set_content_loc or
                (wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):

                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url, '307 Temporary Redirect')
                if self.enable_memento:
                    # non-proxy timegate redirects get the full set of
                    # memento links; otherwise only the 'original' link
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri, full_prefix,
                                                memento_dt, cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate, is_proxy)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs)

        # when _add_range reports truthy, the modifier is switched to 'id_'
        # (presumably unrewritten replay for range responses -- confirm)
        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        is_ajax = self.is_ajax(environ)

        # ajax requests get no head insert
        if is_ajax:
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.
                                    create_insert_func(wb_url,
                                                       full_prefix,
                                                       host_prefix,
                                                       top_url,
                                                       environ,
                                                       framed_replay,
                                                       config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker:
            cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                               cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx)

        status_headers, gen, is_rw = result

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # a statusline without any space gets ' None' appended
        # (presumably a bare status code with no reason phrase -- confirm)
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'], full_prefix,
                                    memento_dt, cdx['timestamp'], status_headers,
                                    is_timegate, is_proxy, cdx.get('source-coll'))

            set_content_loc = True

        # Content-Location is only added when not redirecting to exact urls
        if set_content_loc and not self.redirect_to_exact:
            status_headers.headers.append(('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                                                                       url=cdx['url'])))
        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        return response
예제 #45
0
    def render_content(self, wb_url, kwargs, environ):
        """Fetch the capture for ``wb_url`` upstream and build the replay response.

        :param str wb_url: the requested url string; any ``#`` is
            percent-encoded and the result is parsed into a ``WbUrl``
        :param dict kwargs: per-request arguments; the keys ``output``,
            ``no_timegate_check``, ``coll``, ``metadata`` and ``cache`` are
            read here and the whole dict is passed to the helpers
        :param dict environ: the request environ; read after
            ``prepare_env`` has been applied, ``pywb_proxy_magic`` is
            written in proxy mode
        :return: an OPTIONS response (proxy preflight), a redirect
            response, a timemap/query/custom response (via
            ``format_response``), a JSON response for history-page
            requests, or a ``WbResponse`` wrapping the rewritten
            status/headers and body
        :raises NotFoundException: if the upstream response status is 404
        :raises UpstreamException: for any other upstream status >= 400
        """
        # percent-encode any '#' before parsing the string into a WbUrl
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)

        # a history-page request replaces the target url and is always
        # treated as ajax (so no head insert is created further below)
        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
        if history_page:
            wb_url.url = history_page
            is_ajax = True
        else:
            is_ajax = self.is_ajax(environ)

        is_timegate = self._check_accept_dt(wb_url, environ)

        # the 'pywb.*' environ keys read below are expected to be present
        # after this call
        self.prepare_env(environ)

        host_prefix = environ['pywb.host_prefix']
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        pywb_static_prefix = environ['pywb.static_prefix'] + '/'
        # proxy mode is detected by the presence of this environ key
        is_proxy = ('wsgiprox.proxy_host' in environ)

        # if OPTIONS in proxy mode, just generate the proxy response
        if is_proxy and self.is_preflight(environ):
            return WbResponse.options_response(environ)

        # choose the content rewriter
        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        # no redirects if in proxy
        redirect_to_exact = self.redirect_to_exact and not is_proxy

        # Check Prefer
        pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ,
                                                      content_rw, is_proxy)

        response = None
        keep_frame_response = False

        # prefer overrides custom response?
        if pref_mod is not None:
            # fast-redirect to preferred
            if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
                new_url = full_prefix + wb_url.to_str(mod=pref_mod)
                headers = [('Preference-Applied', pref_applied),
                           ('Vary', 'Prefer')]

                return WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect',
                                                 headers=headers)
            else:
                wb_url.mod = pref_mod
        else:
            # no preference applied: try timemap, query, then custom response
            if kwargs.get('output'):
                response = self.handle_timemap(wb_url, kwargs, full_prefix)

            elif wb_url.is_query():
                response = self.handle_query(environ, wb_url, kwargs,
                                             full_prefix)

            else:
                response = self.handle_custom_response(environ, wb_url,
                                                       full_prefix,
                                                       host_prefix, kwargs)

                # when set, a custom response is not returned right away;
                # the upstream request is made first (see the check after
                # the cdx is loaded)
                keep_frame_response = (not kwargs.get('no_timegate_check')
                                       and is_timegate
                                       and not is_proxy) or redirect_to_exact

        if response and not keep_frame_response:
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix,
                                      pywb_static_prefix=pywb_static_prefix)

            framed_replay = self.framed_replay

        url_parts = urlsplit(wb_url.url)
        # url without a path: hand off to send_redirect with '/' as the path
        if not url_parts.path:
            return self.send_redirect('/', url_parts, urlrewriter)

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(
            inputreq, wb_url)

        # cookie tracking is only applied when a tracker is configured
        # and a cookie key is available for this request
        setcookie_headers = None
        cookie_key = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            if cookie_key:
                res = self.cookie_tracker.get_cookie_headers(
                    wb_url.url, urlrewriter, cookie_key,
                    environ.get('HTTP_COOKIE', ''))
                inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        if r.status_code >= 400:
            # upstream error: read the body on a best-effort basis, always
            # close the raw stream, then raise with the body as details
            error = None
            try:
                error = r.raw.read()
            except Exception:
                pass
            finally:
                no_except_close(r.raw)

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            if r.status_code == 404:
                raise NotFoundException(url=wb_url.url, msg=details)

            else:
                raise UpstreamException(r.status_code,
                                        url=wb_url.url,
                                        details=details)

        # the cdx entry for the loaded capture is passed back in a header
        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        cdx_url_parts = urlsplit(cdx['url'])

        if cdx_url_parts.path.endswith(
                '/') and not url_parts.path.endswith('/'):
            # add trailing slash
            new_path = url_parts.path + '/'

            # the upstream body is not used when redirecting
            no_except_close(r.raw)

            return self.send_redirect(new_path, url_parts, urlrewriter)

        # only redirect to exact if not live, otherwise set to false
        redirect_to_exact = redirect_to_exact and not cdx.get('is_live')

        # return top-frame timegate response, with timestamp from cdx
        if response and keep_frame_response and (not redirect_to_exact
                                                 or not is_timegate):
            no_except_close(r.raw)
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy,
                                        cdx['timestamp'])

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        # cdx['urlkey'] = urlkey
        # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        # cdx['url'] = target_uri

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redirect to exact timestamp (only set if not live)
        if redirect_to_exact:
            if set_content_loc or is_timegate or wb_url.timestamp != cdx.get(
                    'timestamp'):
                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect')
                if self.enable_memento:
                    # non-proxy timegate redirects get the memento links
                    # (without Memento-Datetime, since is_memento=False);
                    # otherwise only the 'original' link is set
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri,
                                                full_prefix,
                                                memento_dt,
                                                cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate,
                                                is_proxy,
                                                pref_applied=pref_applied,
                                                mod=pref_mod,
                                                is_memento=False)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(
                            target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs, record)

        # when _add_range reports truthy, the modifier is switched to 'id_'
        # (presumably unrewritten replay for range responses -- confirm)
        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        if is_ajax:
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.create_insert_func(
                wb_url,
                full_prefix,
                host_prefix,
                top_url,
                environ,
                framed_replay,
                coll=kwargs.get('coll', ''),
                replay_mod=self.replay_mod,
                metadata=kwargs.get('metadata', {}),
                config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker and cookie_key:
            # skip add cookie if service worker is not 200
            # it seems cookie headers from service workers are not applied, so don't update in cache
            if wb_url.mod == 'sw_':
                cookie_key = None

            cookie_rewriter = self.cookie_tracker.get_rewriter(
                urlrewriter, cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter,
                            head_insert_func, cdx, environ)

        status_headers, gen, is_rw = result

        if history_page:
            # title fallback order: extracted from the content, then the
            # X-Wombat-History-Title header, then the history page url
            title = DefaultRewriter._extract_title(gen)
            if not title:
                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

            if not title:
                title = history_page

            self._add_history_page(cdx, kwargs, title)
            return WbResponse.json_response({'title': title})

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # a statusline without any space gets ' None' appended
        # (presumably a bare status code with no reason phrase -- confirm)
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'],
                                    full_prefix,
                                    memento_dt,
                                    cdx['timestamp'],
                                    status_headers,
                                    is_timegate,
                                    is_proxy,
                                    cdx.get('source-coll'),
                                    mod=pref_mod,
                                    pref_applied=pref_applied)

            set_content_loc = True

        # Content-Location is skipped in proxy mode and when redirecting
        # to exact urls
        if set_content_loc and not redirect_to_exact and not is_proxy:
            status_headers.headers.append(
                ('Content-Location',
                 urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                         url=cdx['url'])))

        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        # proxy-mode requests that carry an Origin header get access
        # control headers added to the response
        if is_proxy and environ.get('HTTP_ORIGIN'):
            response.add_access_control_headers(environ)

        # long-lived caching only for upstream 200 responses, when
        # kwargs['cache'] is 'always' and a Referer header is present
        if r.status_code == 200 and kwargs.get(
                'cache') == 'always' and environ.get('HTTP_REFERER'):
            response.status_headers[
                'Cache-Control'] = 'public, max-age=31536000, immutable'

        return response
예제 #46
0
파일: handlers.py 프로젝트: ikreymer/pywb
def to_link(cdx_iter, fields):
    """Format cdx entries as a link-format timemap.

    :param cdx_iter: cdx entries, passed to ``MementoUtils.make_timemap``
    :param fields: accepted but not used by this function
    :return: ``(content_type, timemap)`` tuple, where the content type is
             always ``application/link-format``
    """
    timemap = MementoUtils.make_timemap(cdx_iter)
    return 'application/link-format', timemap