示例#1
0
def do_deprefix(url, rel_prefix, full_prefix):
    encoded = urllib.quote_plus(full_prefix)
    url = url.replace(full_prefix, encoded)

    rewriter = UrlRewriter(url, rel_prefix, full_prefix)
    url = rewriter.deprefix_url()
    return urllib.unquote_plus(url)
示例#2
0
def do_deprefix(url, rel_prefix, full_prefix):
    encoded = urllib.quote_plus(full_prefix)
    url = url.replace(full_prefix, encoded)

    rewriter = UrlRewriter(url, rel_prefix, full_prefix)
    url = rewriter.deprefix_url()
    return urllib.unquote_plus(url)
示例#3
0
    def setup_class(cls):
        urlrewriter = UrlRewriter(
            '20161226/http://example.com/?callback=jQuery_ABC', '/web/',
            'https://localhost/web/')
        cls.rewriter = JSONPRewriter(urlrewriter)

        urlrewriter = UrlRewriter('20161226/http://example.com/', '/web/',
                                  'https://localhost/web/')
        cls.rewriter_missing_cb = JSONPRewriter(urlrewriter)
示例#4
0
def parse(html_text, is_ajax=False):
    urlrewriter = UrlRewriter(
        '20131226101010/https://example.com/some/path.html', '/web/')

    if is_ajax:
        urlrewriter.rewrite_opts['is_ajax'] = True

    rewriter = HTMLInsertOnlyRewriter(urlrewriter, head_insert='<!--Insert-->')

    return rewriter.rewrite(html_text) + rewriter.final_read()
示例#5
0
def _test_proxy_headers(http_cache=None):
    headers = _make_cache_headers()
    status = '200 OK'
    rewriter = UrlRewriter('20131010/http://example.com/', '/pywb/',
                           rewrite_opts={'http_cache': http_cache})

    rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers),
                                       rewriter,
                                       rewriter.get_cookie_rewriter())
    return rewritten.status_headers
示例#6
0
def _test_proxy_headers(http_cache=None):
    headers = _make_cache_headers()
    status = '200 OK'
    rewriter = UrlRewriter('20131010/http://example.com/', '/pywb/',
                           rewrite_opts={'http_cache': http_cache})

    rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers),
                                       rewriter,
                                       rewriter.get_cookie_rewriter())
    return rewritten.status_headers
    def rewrite_record(self, headers, content, ts, url='http://example.com/',
                       prefix='http://localhost:8080/prefix/', warc_headers=None,
                       request_url=None, is_live=None, use_js_proxy=True, environ=None):

        record = self._create_response_record(url, headers, content, warc_headers)

        wburl = WbUrl(ts + '/' + (request_url or url))
        url_rewriter = UrlRewriter(wburl, prefix)

        cdx = CDXObject()
        cdx['url'] = url
        cdx['timestamp'] = ts
        cdx['urlkey'] = canonicalize(url)
        if request_url != url:
            cdx['is_fuzzy'] = '1'
        cdx['is_live'] = is_live

        def insert_func(rule, cdx):
            return ''

        if use_js_proxy:
            rewriter = self.js_proxy_content_rewriter
        else:
            rewriter = self.content_rewriter

        return rewriter(record, url_rewriter, cookie_rewriter=None,
                        head_insert_func=insert_func,
                        cdx=cdx,
                        environ=environ)
示例#8
0
    def rewrite_record(self,
                       headers,
                       content,
                       ts,
                       url='http://example.com/',
                       prefix='http://localhost:8080/prefix/',
                       warc_headers=None,
                       request_url=None,
                       is_live=None):

        record = self._create_response_record(url, headers, content,
                                              warc_headers)

        wburl = WbUrl(ts + '/' + (request_url or url))
        url_rewriter = UrlRewriter(wburl, prefix)

        cdx = CDXObject()
        cdx['url'] = url
        cdx['timestamp'] = ts
        cdx['urlkey'] = canonicalize(url)
        if request_url != url:
            cdx['is_fuzzy'] = '1'
        cdx['is_live'] = is_live

        return self.content_rewriter(record, url_rewriter, None, cdx=cdx)
def parse(html_text):
    urlrewriter = UrlRewriter(
        '20131226101010/https://example.com/some/path.html', '/web/')

    rewriter = HTMLInsertOnlyRewriter(urlrewriter, head_insert='<!--Insert-->')

    return rewriter.rewrite(html_text) + rewriter.final_read()
示例#10
0
def do_rewrite(rel_url, base_url, prefix, mod=None, full_prefix=None):
    rewriter = UrlRewriter(base_url, prefix, full_prefix=full_prefix)
    return rewriter.rewrite(rel_url, mod)
示例#11
0
    def render_content(self, wb_url, kwargs, environ):
        wb_url = WbUrl(wb_url)

        host_prefix = self.get_host_prefix(environ)
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        resp = self.handle_custom_response(environ, wb_url,
                                           full_prefix, host_prefix, kwargs)
        if resp is not None:
            content_type = 'text/html'

            # if not replay outer frame, specify utf-8 charset
            if not self.is_framed_replay(wb_url):
                content_type += '; charset=utf-8'

            return WbResponse.text_response(resp, content_type=content_type)

        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix)

        self.unrewrite_referrer(environ)

        urlkey = canonicalize(wb_url.url)

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url,
                                       self.content_rewriter)

        inputreq.include_post_query(wb_url.url)

        mod_url = None
        use_206 = False
        rangeres = None

        readd_range = False
        async_record_url = None

        if kwargs.get('type') in ('record', 'patch'):
            rangeres = inputreq.extract_range()

            if rangeres:
                mod_url, start, end, use_206 = rangeres

                # if bytes=0- Range request,
                # simply remove the range and still proxy
                if start == 0 and not end and use_206:
                    wb_url.url = mod_url
                    inputreq.url = mod_url

                    del environ['HTTP_RANGE']
                    readd_range = True
                else:
                    async_record_url = mod_url

        skip = async_record_url is not None

        setcookie_headers = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            res = self.cookie_tracker.get_cookie_headers(wb_url.url, cookie_key)
            inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip)

        if r.status_code >= 400:
            error = None
            try:
                error = r.raw.read()
                r.raw.close()
            except:
                pass

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            raise UpstreamException(r.status_code, url=wb_url.url, details=details)

        if async_record_url:
            environ.pop('HTTP_RANGE', '')
            gevent.spawn(self._do_async_req,
                         inputreq,
                         async_record_url,
                         wb_url,
                         kwargs,
                         False)

        record = self.loader.parse_record_stream(r.raw)

        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
        cdx['url'] = wb_url.url

        self._add_custom_params(cdx, r.headers, kwargs)

        if readd_range:
            content_length = (record.status_headers.
                              get_header('Content-Length'))
            try:
                content_length = int(content_length)
                record.status_headers.add_range(0, content_length,
                                                   content_length)
            except (ValueError, TypeError):
                pass

        if self.is_ajax(environ):
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.
                                    create_insert_func(wb_url,
                                                       full_prefix,
                                                       host_prefix,
                                                       top_url,
                                                       environ,
                                                       self.framed_replay))

        cookie_rewriter = None
        if self.cookie_tracker:
            cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                               cookie_key)

        result = self.content_rewriter.rewrite_content(urlrewriter,
                                               record.status_headers,
                                               record.stream,
                                               head_insert_func,
                                               urlkey,
                                               cdx,
                                               cookie_rewriter,
                                               environ)

        status_headers, gen, is_rw = result

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        return WbResponse(status_headers, gen)
示例#12
0
 def setup_class(cls):
     cls.urlrewriter = UrlRewriter('20171226/http://example.com/', '/warc/')
     cls.default_rewriter = DefaultRewriter()
示例#13
0
    def render_content(self, wb_url, kwargs, environ):
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)
        is_timegate = self._check_accept_dt(wb_url, environ)

        host_prefix = self.get_host_prefix(environ)
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        is_proxy = ('wsgiprox.proxy_host' in environ)

        response = self.handle_custom_response(environ, wb_url,
                                               full_prefix, host_prefix,
                                               kwargs)

        if response:
            return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix)

            framed_replay = self.framed_replay

        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            scheme, netloc, path, query, frag = url_parts
            path = '/'
            url = urlunsplit((scheme, netloc, path, query, frag))
            resp = WbResponse.redir_response(urlrewriter.rewrite(url),
                                             '307 Temporary Redirect')

            if self.enable_memento:
                resp.status_headers['Link'] = MementoUtils.make_link(url, 'original')

            return resp

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(inputreq, wb_url)

        setcookie_headers = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
            inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        if r.status_code >= 400:
            error = None
            try:
                error = r.raw.read()
                r.raw.close()
            except:
                pass

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            raise UpstreamException(r.status_code, url=wb_url.url, details=details)

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        #cdx['urlkey'] = urlkey
        #cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        #cdx['url'] = target_uri

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redir to exact, redir if url or ts are different
        if self.redirect_to_exact:
            if (set_content_loc or
                (wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):

                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url, '307 Temporary Redirect')
                if self.enable_memento:
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri, full_prefix,
                                                memento_dt, cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate, is_proxy)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs)

        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        is_ajax = self.is_ajax(environ)

        if is_ajax:
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.
                                    create_insert_func(wb_url,
                                                       full_prefix,
                                                       host_prefix,
                                                       top_url,
                                                       environ,
                                                       framed_replay,
                                                       config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker:
            cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                               cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx)

        status_headers, gen, is_rw = result

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'], full_prefix,
                                    memento_dt, cdx['timestamp'], status_headers,
                                    is_timegate, is_proxy, cdx.get('source-coll'))

            set_content_loc = True

        if set_content_loc and not self.redirect_to_exact:
            status_headers.headers.append(('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                                                                       url=cdx['url'])))
        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        return response
示例#14
0
from pywb.rewrite.rewrite_live import LiveRewriter
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.wburl import WbUrl

from pywb.utils.loaders import to_native_str

from pywb import get_test_dir

from io import BytesIO

# This module has some rewriting tests against the 'live web'
# As such, the content may change and the test may break

urlrewriter = UrlRewriter(
    '20131226101010/http://example.com/some/path/index.html', '/pywb/')
bn_urlrewriter = UrlRewriter(
    '20131226101010bn_/http://example.com/some/path/index.html', '/pywb/')


def head_insert_func(rule, cdx):
    if rule.js_rewrite_location != 'urls':
        return '<script src="/static/__pywb/wombat.js"> </script>'
    else:
        return ''


def test_csrf_token_headers():
    rewriter = LiveRewriter()
    env = {'HTTP_X_CSRFTOKEN': 'wrong', 'HTTP_COOKIE': 'csrftoken=foobar'}

    req_headers = rewriter.translate_headers('http://example.com/',
示例#15
0

"""


from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter
from pywb.rewrite.cookie_rewriter import get_cookie_rewriter
from pywb.rewrite.url_rewriter import UrlRewriter
import pytest
import sys

urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
                          'http://*****:*****@pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported')
def test_with_expires():
    # keep expires
    res = rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter2, 'coll')
    assert len(res) == 1
示例#16
0
def do_deprefix(url, rel_prefix, full_prefix):
    rewriter = UrlRewriter(url, rel_prefix, full_prefix)
    url = rewriter.deprefix_url()
    return urllib.unquote_plus(url)
示例#17
0
def do_deprefix(url, rel_prefix, full_prefix):
    rewriter = UrlRewriter(url, rel_prefix, full_prefix)
    url = rewriter.deprefix_url()
    return unquote_plus(url)
示例#18
0
    def render_content(self, wb_url, kwargs, environ):
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)

        proto = environ.get('HTTP_X_FORWARDED_PROTO', self.force_scheme)

        if proto:
            environ['wsgi.url_scheme'] = proto

        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
        if history_page:
            wb_url.url = history_page
            is_ajax = True
        else:
            is_ajax = self.is_ajax(environ)

        is_timegate = self._check_accept_dt(wb_url, environ)

        host_prefix = self.get_host_prefix(environ)
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix
        environ['pywb.host_prefix'] = host_prefix
        pywb_static_prefix = host_prefix + environ.get(
            'pywb.app_prefix', '') + environ.get('pywb.static_prefix',
                                                 '/static/')
        is_proxy = ('wsgiprox.proxy_host' in environ)

        response = self.handle_custom_response(environ, wb_url, full_prefix,
                                               host_prefix, kwargs)

        if response:
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix,
                                      pywb_static_prefix=pywb_static_prefix)

            framed_replay = self.framed_replay

        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            return self.send_redirect('/', url_parts, urlrewriter)

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(
            inputreq, wb_url)

        setcookie_headers = None
        cookie_key = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            if cookie_key:
                res = self.cookie_tracker.get_cookie_headers(
                    wb_url.url, urlrewriter, cookie_key,
                    environ.get('HTTP_COOKIE', ''))
                inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        if r.status_code >= 400:
            error = None
            try:
                error = r.raw.read()
            except Exception:
                pass
            finally:
                no_except_close(r.raw)

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            raise UpstreamException(r.status_code,
                                    url=wb_url.url,
                                    details=details)

        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        cdx_url_parts = urlsplit(cdx['url'])

        if cdx_url_parts.path.endswith(
                '/') and not url_parts.path.endswith('/'):
            # add trailing slash
            new_path = url_parts.path + '/'

            no_except_close(r.raw)

            return self.send_redirect(new_path, url_parts, urlrewriter)

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        # cdx['urlkey'] = urlkey
        # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        # cdx['url'] = target_uri

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redir to exact, redir if url or ts are different
        if self.redirect_to_exact:
            if (set_content_loc or (wb_url.timestamp != cdx.get('timestamp')
                                    and not cdx.get('is_live'))):

                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect')
                if self.enable_memento:
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri, full_prefix,
                                                memento_dt, cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate, is_proxy)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(
                            target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs, record)

        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        if is_ajax:
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.create_insert_func(
                wb_url,
                full_prefix,
                host_prefix,
                top_url,
                environ,
                framed_replay,
                coll=kwargs.get('coll', ''),
                replay_mod=self.replay_mod,
                config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker and cookie_key:
            # skip add cookie if service worker is not 200
            # it seems cookie headers from service workers are not applied, so don't update in cache
            if wb_url.mod == 'sw_':
                cookie_key = None

            cookie_rewriter = self.cookie_tracker.get_rewriter(
                urlrewriter, cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter,
                            head_insert_func, cdx, environ)

        status_headers, gen, is_rw = result

        if history_page:
            title = DefaultRewriter._extract_title(gen)
            if not title:
                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

            if not title:
                title = history_page

            self._add_history_page(cdx, kwargs, title)
            return WbResponse.json_response({'title': title})

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'], full_prefix, memento_dt,
                                    cdx['timestamp'], status_headers,
                                    is_timegate, is_proxy,
                                    cdx.get('source-coll'))

            set_content_loc = True

        if set_content_loc and not self.redirect_to_exact:
            status_headers.headers.append(
                ('Content-Location',
                 urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                         url=cdx['url'])))
        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        return response
示例#19
0
# Test no parsing at all
>>> p = HTMLRewriter(urlrewriter)
>>> p.close()
''

"""

from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.html_rewriter import HTMLRewriter

import pprint
import urllib

urlrewriter = UrlRewriter(
    '20131226101010/http://example.com/some/path/index.html',
    '/web/',
    rewrite_opts=dict(punycode_links=False))

urlrewriter_pencode = UrlRewriter(
    '20131226101010/http://example.com/some/path/index.html',
    '/web/',
    rewrite_opts=dict(punycode_links=True))

no_base_canon_rewriter = UrlRewriter(
    '20131226101010/http://example.com/some/path/index.html',
    '/web/',
    rewrite_opts=dict(rewrite_rel_canon=False, rewrite_base=False))


def parse(data, head_insert=None, urlrewriter=urlrewriter):
    parser = HTMLRewriter(urlrewriter, head_insert=head_insert)
示例#20
0
def do_rewrite(rel_url, base_url, prefix, mod=None, full_prefix=None):
    rewriter = UrlRewriter(base_url, prefix, full_prefix=full_prefix)
    return rewriter.rewrite(rel_url, mod)
示例#21
0
>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT')
[('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')]

# Cookie with invalid chars, not parsed
>>> rewrite_cookie('abc@def=123')
[]

# ExactCookieRewriter
>>> rewrite_cookie('some=value; Path=/diff/path/;', urlrewriter, ExactPathCookieRewriter)
[('Set-Cookie', 'some=value')]

>>> rewrite_cookie('some=value; Domain=.example.com; Path=/diff/path/; Max-Age=1500', urlrewriter, ExactPathCookieRewriter)
[('Set-Cookie', 'some=value')]

"""

from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter, ExactPathCookieRewriter
from pywb.rewrite.url_rewriter import UrlRewriter

urlrewriter = UrlRewriter(
    '20131226101010/http://example.com/some/path/index.html', '/pywb/')

urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/')


def rewrite_cookie(cookie_str,
                   rewriter=urlrewriter,
                   cookie_rewriter=MinimalScopeCookieRewriter):
    return cookie_rewriter(rewriter).rewrite(cookie_str)
示例#22
0
def new_rewriter(prefix='/web/', rewrite_opts=dict()):
    PROXY_PATH = '20131226101010/{0}'.format(ORIGINAL_URL)
    return UrlRewriter(PROXY_PATH, prefix, rewrite_opts=rewrite_opts)
示例#23
0
[]

>>> rewrite_cookie('some=value; Domain=.example.com; Path=/diff/path/; Max-Age=1500', urlrewriter, 'removeall')
[]


"""

from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter
from pywb.rewrite.cookie_rewriter import get_cookie_rewriter
from pywb.rewrite.url_rewriter import UrlRewriter
import pytest
import sys

urlrewriter = UrlRewriter(
    '20131226101010/http://example.com/some/path/index.html',
    'http://*****:*****@pytest.mark.skipif(sys.version_info < (2, 7), reason='Unsupported')
示例#24
0
 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
  ('Content-Type', 'image/png'),
  ('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
  ('Content-Encoding', 'gzip'),
  ('X-Archive-Orig-Transfer-Encoding', 'chunked')]),
 'text_type': None}

"""

from pywb.rewrite.header_rewriter import HeaderRewriter
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders

import pprint

urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/')

headerrewriter = HeaderRewriter()


def _test_headers(headers, status='200 OK'):
    rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers),
                                       urlrewriter,
                                       urlrewriter.get_cookie_rewriter())
    return pprint.pprint(vars(rewritten))


if __name__ == "__main__":
    import doctest
    doctest.testmod()
示例#25
0
>>> parse('')
<BLANKLINE>

# Test no parsing at all
>>> p = HTMLRewriter(urlrewriter)
>>> p.close()
''

"""

from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.html_rewriter import HTMLRewriter

import pprint

urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')

no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
                                     '/web/',
                                     rewrite_opts=dict(rewrite_rel_canon=False,
                                                       rewrite_base=False))

def parse(data, head_insert=None, urlrewriter=urlrewriter):
    parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
    #data = data.decode('utf-8')
    result = parser.rewrite(data) + parser.close()
    # decode only for printing
    print result.decode('utf-8')

if __name__ == "__main__":
    import doctest
>>> _test_css("@import (\"url.css\")")
'@import ("url.css")'

>>> _test_css("@import url(/url.css)\n@import  url(/anotherurl.css)\n @import  url(/and_a_third.css)")
'@import url(/web/20131010cs_/http://example.com/url.css)\n@import  url(/web/20131010cs_/http://example.com/anotherurl.css)\n @import  url(/web/20131010cs_/http://example.com/and_a_third.css)'

"""


#=================================================================
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter, RxRules
from pywb.rewrite.regex_rewriters import JSWombatProxyRewriter


urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/', 'https://localhost/web/')


def _test_js(string, extra = []):
    return JSRewriter(urlrewriter, extra).rewrite(string)

def _test_js_obj_proxy(string):
    rw = JSWombatProxyRewriter(urlrewriter)
    rw.first_buff = ''
    rw.close_string = ''
    return rw.rewrite(string)

def _test_xml(string):
    return XMLRewriter(urlrewriter).rewrite(string)

def _test_css(string):
示例#27
0
    def render_content(self, wb_url, kwargs, environ):
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)

        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
        if history_page:
            wb_url.url = history_page
            is_ajax = True
        else:
            is_ajax = self.is_ajax(environ)

        is_timegate = self._check_accept_dt(wb_url, environ)

        self.prepare_env(environ)

        host_prefix = environ['pywb.host_prefix']
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        pywb_static_prefix = environ['pywb.static_prefix'] + '/'
        is_proxy = ('wsgiprox.proxy_host' in environ)

        # if OPTIONS in proxy mode, just generate the proxy responss
        if is_proxy and self.is_preflight(environ):
            return WbResponse.options_response(environ)

        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        # no redirects if in proxy
        redirect_to_exact = self.redirect_to_exact and not is_proxy

        # Check Prefer
        pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ,
                                                      content_rw, is_proxy)

        response = None
        keep_frame_response = False

        # prefer overrides custom response?
        if pref_mod is not None:
            # fast-redirect to preferred
            if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
                new_url = full_prefix + wb_url.to_str(mod=pref_mod)
                headers = [('Preference-Applied', pref_applied),
                           ('Vary', 'Prefer')]

                return WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect',
                                                 headers=headers)
            else:
                wb_url.mod = pref_mod
        else:
            if kwargs.get('output'):
                response = self.handle_timemap(wb_url, kwargs, full_prefix)

            elif wb_url.is_query():
                response = self.handle_query(environ, wb_url, kwargs,
                                             full_prefix)

            else:
                response = self.handle_custom_response(environ, wb_url,
                                                       full_prefix,
                                                       host_prefix, kwargs)

                keep_frame_response = (not kwargs.get('no_timegate_check')
                                       and is_timegate
                                       and not is_proxy) or redirect_to_exact

        if response and not keep_frame_response:
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix,
                                      pywb_static_prefix=pywb_static_prefix)

            framed_replay = self.framed_replay

        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            return self.send_redirect('/', url_parts, urlrewriter)

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(
            inputreq, wb_url)

        setcookie_headers = None
        cookie_key = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            if cookie_key:
                res = self.cookie_tracker.get_cookie_headers(
                    wb_url.url, urlrewriter, cookie_key,
                    environ.get('HTTP_COOKIE', ''))
                inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        if r.status_code >= 400:
            error = None
            try:
                error = r.raw.read()
            except Exception:
                pass
            finally:
                no_except_close(r.raw)

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            if r.status_code == 404:
                raise NotFoundException(url=wb_url.url, msg=details)

            else:
                raise UpstreamException(r.status_code,
                                        url=wb_url.url,
                                        details=details)

        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        cdx_url_parts = urlsplit(cdx['url'])

        if cdx_url_parts.path.endswith(
                '/') and not url_parts.path.endswith('/'):
            # add trailing slash
            new_path = url_parts.path + '/'

            no_except_close(r.raw)

            return self.send_redirect(new_path, url_parts, urlrewriter)

        # only redirect to exact if not live, otherwise set to false
        redirect_to_exact = redirect_to_exact and not cdx.get('is_live')

        # return top-frame timegate response, with timestamp from cdx
        if response and keep_frame_response and (not redirect_to_exact
                                                 or not is_timegate):
            no_except_close(r.raw)
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy,
                                        cdx['timestamp'])

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        # cdx['urlkey'] = urlkey
        # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        # cdx['url'] = target_uri

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redirect to exact timestamp (only set if not live)
        if redirect_to_exact:
            if set_content_loc or is_timegate or wb_url.timestamp != cdx.get(
                    'timestamp'):
                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect')
                if self.enable_memento:
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri,
                                                full_prefix,
                                                memento_dt,
                                                cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate,
                                                is_proxy,
                                                pref_applied=pref_applied,
                                                mod=pref_mod,
                                                is_memento=False)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(
                            target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs, record)

        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        if is_ajax:
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.create_insert_func(
                wb_url,
                full_prefix,
                host_prefix,
                top_url,
                environ,
                framed_replay,
                coll=kwargs.get('coll', ''),
                replay_mod=self.replay_mod,
                metadata=kwargs.get('metadata', {}),
                config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker and cookie_key:
            # skip add cookie if service worker is not 200
            # it seems cookie headers from service workers are not applied, so don't update in cache
            if wb_url.mod == 'sw_':
                cookie_key = None

            cookie_rewriter = self.cookie_tracker.get_rewriter(
                urlrewriter, cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter,
                            head_insert_func, cdx, environ)

        status_headers, gen, is_rw = result

        if history_page:
            title = DefaultRewriter._extract_title(gen)
            if not title:
                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

            if not title:
                title = history_page

            self._add_history_page(cdx, kwargs, title)
            return WbResponse.json_response({'title': title})

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'],
                                    full_prefix,
                                    memento_dt,
                                    cdx['timestamp'],
                                    status_headers,
                                    is_timegate,
                                    is_proxy,
                                    cdx.get('source-coll'),
                                    mod=pref_mod,
                                    pref_applied=pref_applied)

            set_content_loc = True

        if set_content_loc and not redirect_to_exact and not is_proxy:
            status_headers.headers.append(
                ('Content-Location',
                 urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                         url=cdx['url'])))

        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        if is_proxy and environ.get('HTTP_ORIGIN'):
            response.add_access_control_headers(environ)

        if r.status_code == 200 and kwargs.get(
                'cache') == 'always' and environ.get('HTTP_REFERER'):
            response.status_headers[
                'Cache-Control'] = 'public, max-age=31536000, immutable'

        return response