Пример #1
0
    def check_if_content(self, wb_url, environ):
        """Mark replay requests as content and redirect them to the content host.

        ``wb_url`` is parsed with ``WbUrl``. When it is a replay url,
        ``environ['is_content']`` is set to ``True``. If a content host is
        configured, the current request is not already a content request and
        the url carries a non-empty modifier that does not start with
        ``'$br:'``, ``self.redir_host`` is invoked with the content host.
        """
        parsed = WbUrl(wb_url)
        if not parsed.is_replay():
            return

        environ['is_content'] = True

        # Nothing to redirect to, or already being served as content.
        if not self.content_host or self.is_content_request():
            return

        modifier = parsed.mod
        if modifier == '' or modifier.startswith('$br:'):
            return

        self.redir_host(self.content_host)
Пример #2
0
    def check_remote_archive(self, wb_url, mode, wb_url_obj=None):
        """Look up a remote archive for the url and build an extract target.

        ``wb_url_obj`` is used when truthy, otherwise ``wb_url`` is parsed
        with ``WbUrl``. Returns ``None`` when ``self.wam_loader`` finds no
        archive for the url. Otherwise returns a ``(mode, url)`` tuple where
        the mode is ``'extract:'`` plus the archive id and the url is the
        archive url re-serialized with the modifier of the original request.
        The incoming ``mode`` argument is not consulted.
        """
        parsed = wb_url_obj if wb_url_obj else WbUrl(wb_url)

        match = self.wam_loader.find_archive_for_url(parsed.url)
        if not match:
            return None

        # The first element of the lookup result is not needed here.
        _, archive_url, archive_id = match

        extract_mode = 'extract:' + archive_id
        rewritten_url = WbUrl(archive_url).to_str(mod=parsed.mod)
        return extract_mode, rewritten_url
Пример #3
0
    def redir_no_cookie(self):
        """Redirect a modifier-bearing request to the same url without a modifier.

        The request path is split into its first segment (the prefix) and the
        remainder, which is parsed as a ``WbUrl``. If that url has a non-empty
        modifier, the client is redirected to ``/<prefix>/<wb_url>`` with the
        modifier cleared, preserving any query string. Nothing happens when
        the modifier is already empty.

        Raises:
            ValueError: if the path contains no ``/`` after the leading
                character, since the two-way unpacking then fails.
        """
        path = request.environ['PATH_INFO']
        prefix, wb_url = path[1:].split('/', 1)
        wb_url = WbUrl(wb_url)

        if wb_url.mod != '':
            url = '/' + prefix + '/'
            url += wb_url.to_str(mod='')

            query = request.environ.get('QUERY_STRING')
            if query:
                url += '?' + query

            redirect(url)
Пример #4
0
    def rewrite_record(self, headers, content, ts, url='http://example.com/',
                       prefix='http://localhost:8080/prefix/', warc_headers=None,
                       request_url=None, is_live=None, use_js_proxy=True, environ=None):
        """Build a response record and run it through a content rewriter.

        The record is created from ``url``, ``headers``, ``content`` and
        ``warc_headers``. The url rewriter is based on ``ts`` plus
        ``request_url`` (falling back to ``url``) and ``prefix``. The cdx
        entry is marked ``is_fuzzy`` whenever ``request_url`` differs from
        ``url``, which includes the default ``request_url=None``.
        ``use_js_proxy`` selects between ``self.js_proxy_content_rewriter``
        and ``self.content_rewriter``; the head insert callback always
        produces an empty string.
        """
        record = self._create_response_record(url, headers, content, warc_headers)

        replay_target = request_url or url
        url_rewriter = UrlRewriter(WbUrl(ts + '/' + replay_target), prefix)

        cdx = CDXObject()
        cdx['url'] = url
        cdx['timestamp'] = ts
        cdx['urlkey'] = canonicalize(url)
        if request_url != url:
            cdx['is_fuzzy'] = '1'
        cdx['is_live'] = is_live

        def insert_func(rule, cdx):
            # No head insert is wanted for these rewrites.
            return ''

        rewriter = (self.js_proxy_content_rewriter if use_js_proxy
                    else self.content_rewriter)

        return rewriter(record,
                        url_rewriter,
                        cookie_rewriter=None,
                        head_insert_func=insert_func,
                        cdx=cdx,
                        environ=environ)
Пример #5
0
    def rewrite_record(self,
                       headers,
                       content,
                       ts,
                       url='http://example.com/',
                       prefix='http://localhost:8080/prefix/',
                       warc_headers=None,
                       request_url=None,
                       is_live=None):
        """Build a response record and pass it through ``self.content_rewriter``.

        The url rewriter is based on ``ts`` plus ``request_url`` (falling back
        to ``url``) and ``prefix``. The cdx entry carries the url, timestamp,
        canonicalized url key and ``is_live`` flag, and is marked ``is_fuzzy``
        whenever ``request_url`` differs from ``url`` (including when
        ``request_url`` is left as ``None``).
        """
        record = self._create_response_record(
            url, headers, content, warc_headers)

        replay_target = request_url or url
        url_rewriter = UrlRewriter(WbUrl(ts + '/' + replay_target), prefix)

        cdx = CDXObject()
        cdx['url'] = url
        cdx['timestamp'] = ts
        cdx['urlkey'] = canonicalize(url)
        if request_url != url:
            cdx['is_fuzzy'] = '1'
        cdx['is_live'] = is_live

        rewritten = self.content_rewriter(record, url_rewriter, None, cdx=cdx)
        return rewritten
Пример #6
0
    def __init__(self,
                 wburl,
                 prefix='',
                 full_prefix=None,
                 rel_prefix=None,
                 root_path=None,
                 cookie_scope=None,
                 rewrite_opts=None,
                 pywb_static_prefix=None):
        """Set up the rewriter for one wayback url and a set of prefixes.

        ``wburl`` may be a ``WbUrl`` instance or anything ``WbUrl`` accepts.
        ``full_prefix`` and ``rel_prefix`` fall back to ``prefix`` when falsy,
        ``root_path`` falls back to ``'/'`` and ``rewrite_opts`` to an empty
        dict. ``prefix_scheme`` is the text before the first ``':'`` of
        ``full_prefix`` when it starts with one of ``self.PROTOCOLS``, else
        ``None``. When the ``'punycode_links'`` rewrite option is truthy,
        percent-encoding is switched off on the wayback url.
        """
        if not isinstance(wburl, WbUrl):
            wburl = WbUrl(wburl)
        self.wburl = wburl

        self.prefix = prefix
        self.full_prefix = full_prefix or prefix
        self.rel_prefix = rel_prefix or prefix
        self.root_path = root_path or '/'

        has_scheme = (self.full_prefix
                      and self.full_prefix.startswith(self.PROTOCOLS))
        self.prefix_scheme = (self.full_prefix.partition(':')[0]
                              if has_scheme else None)

        # Kept as an `and` expression: a falsy prefix is stored as-is.
        self.prefix_abs = (self.prefix
                           and self.prefix.startswith(self.PROTOCOLS))

        self.cookie_scope = cookie_scope
        self.rewrite_opts = rewrite_opts or {}
        self._pywb_static_prefix = pywb_static_prefix

        if self.rewrite_opts.get('punycode_links'):
            self.wburl._do_percent_encode = False
Пример #7
0
    def rebase_rewriter(self, new_url):
        """Return a rewriter rebased onto ``new_url``.

        A leading ``self.prefix`` is stripped from ``new_url``; failing that,
        a leading ``self.rel_prefix`` is stripped. The remainder is parsed as
        a ``WbUrl`` and handed to ``self._create_rebased_rewriter`` together
        with ``self.prefix``.
        """
        for known_prefix in (self.prefix, self.rel_prefix):
            if new_url.startswith(known_prefix):
                new_url = new_url[len(known_prefix):]
                break

        return self._create_rebased_rewriter(WbUrl(new_url), self.prefix)
Пример #8
0
    def rebase_rewriter(self, base_url):
        """Return a rewriter rebased onto ``base_url``.

        A ``base_url`` that does not start with one of ``self.PROTOCOLS`` is
        first joined onto the current url via ``self.urljoin``. The current
        wayback url is then re-serialized with that url, parsed again as a
        ``WbUrl`` and passed to ``self._create_rebased_rewriter`` together
        with ``self.prefix``.
        """
        is_absolute = base_url.startswith(self.PROTOCOLS)
        if not is_absolute:
            base_url = self.urljoin(self.wburl.url, base_url)

        rebased = WbUrl(self.wburl.to_str(url=base_url))
        return self._create_rebased_rewriter(rebased, self.prefix)
Пример #9
0
    def __init__(self, orig_url):
        """Parse ``orig_url`` as a query or replay wayback url.

        On Python 2, text input is utf-8 encoded and percent-quoted first.
        The url is then parsed as a query url, or failing that as a replay
        url; if neither parser accepts it an ``Exception`` is raised. The
        resulting ``self.url`` is passed through ``WbUrl.to_uri``. Urls
        starting with ``urn:``, ``screenshot:`` or ``thumbnail:`` are left
        untouched after that; any other url gets ``self.DEFAULT_SCHEME``
        prepended when ``self.SCHEME_RX`` finds no scheme, or has a missing
        second slash after the scheme inserted.
        """
        import re
        import six

        from six.moves.urllib.parse import urlsplit, urlunsplit
        from six.moves.urllib.parse import quote_plus, quote, unquote_plus

        from pywb.utils.loaders import to_native_str
        from pywb.rewrite.wburl import WbUrl

        pywb.rewrite.wburl.BaseWbUrl.__init__(self)

        if six.PY2 and isinstance(orig_url, six.text_type):
            orig_url = quote(orig_url.encode('utf-8'))

        self._original_url = orig_url

        parsed = self._init_query(orig_url) or self._init_replay(orig_url)
        if not parsed:
            raise Exception('Invalid WbUrl: ', orig_url)

        normalized = WbUrl.to_uri(self.url)
        self._do_percent_encode = True
        self.url = normalized

        # begin brozzler changes
        if self.url.startswith(('urn:', 'screenshot:', 'thumbnail:')):
            return
        # end brozzler changes

        # protocol agnostic url -> http://
        # no protocol -> http://
        scheme_match = self.SCHEME_RX.match(self.url)
        scheme_pos = scheme_match.start(1) if scheme_match else -1

        if scheme_pos < 0:
            self.url = self.DEFAULT_SCHEME + self.url
            return

        # Position where the second slash after the scheme is expected.
        slash_pos = scheme_pos + 2
        if slash_pos < len(self.url) and self.url[slash_pos] != '/':
            self.url = self.url[:slash_pos] + '/' + self.url[slash_pos:]
Пример #10
0
    def __init__(self, orig_url):
        """Parse ``orig_url`` as a query or replay wayback url.

        On Python 2, text input is utf-8 encoded and percent-quoted first.
        The url is then parsed as a query url, or failing that as a replay
        url; if neither parser accepts it an ``Exception`` is raised. The
        resulting ``self.url`` is passed through ``WbUrl.to_uri``. Urls
        starting with ``urn:``, ``screenshot:`` or ``thumbnail:`` are left
        untouched after that; any other url gets ``self.DEFAULT_SCHEME``
        prepended when ``self.SCHEME_RX`` finds no scheme, or has a missing
        second slash after the scheme inserted.
        """
        import re
        import six

        from six.moves.urllib.parse import urlsplit, urlunsplit
        from six.moves.urllib.parse import quote_plus, quote, unquote_plus

        from pywb.utils.loaders import to_native_str
        from pywb.rewrite.wburl import WbUrl

        pywb.rewrite.wburl.BaseWbUrl.__init__(self)

        if six.PY2 and isinstance(orig_url, six.text_type):
            orig_url = quote(orig_url.encode('utf-8'))

        self._original_url = orig_url

        parsed = self._init_query(orig_url) or self._init_replay(orig_url)
        if not parsed:
            raise Exception('Invalid WbUrl: ', orig_url)

        normalized = WbUrl.to_uri(self.url)
        self._do_percent_encode = True
        self.url = normalized

        # begin brozzler changes
        if self.url.startswith(('urn:', 'screenshot:', 'thumbnail:')):
            return
        # end brozzler changes

        # protocol agnostic url -> http://
        # no protocol -> http://
        scheme_match = self.SCHEME_RX.match(self.url)
        scheme_pos = scheme_match.start(1) if scheme_match else -1

        if scheme_pos < 0:
            self.url = self.DEFAULT_SCHEME + self.url
            return

        # Position where the second slash after the scheme is expected.
        slash_pos = scheme_pos + 2
        if slash_pos < len(self.url) and self.url[slash_pos] != '/':
            self.url = self.url[:slash_pos] + '/' + self.url[slash_pos:]
Пример #11
0
    def unrewrite_referrer(self, environ, full_prefix):
        """Replace a rewritten ``HTTP_REFERER`` with the url it wraps.

        Returns ``True`` and updates ``environ['HTTP_REFERER']`` when the
        referrer starts with ``full_prefix`` and has content after it; that
        remainder is parsed as a ``WbUrl`` and its ``url`` is stored. Returns
        ``False`` and leaves ``environ`` untouched otherwise.
        """
        original = environ.get('HTTP_REFERER')
        if not original or not original.startswith(full_prefix):
            return False

        remainder = original[len(full_prefix):]
        if not remainder:
            return False

        environ['HTTP_REFERER'] = WbUrl(remainder).url
        return True
Пример #12
0
def make_timemap(wbrequest, cdx_lines):
    """Yield the lines of a link-format timemap for the requested url.

    ``cdx_lines`` is an iterator of cdx entries. The yielded strings are, in
    order: a "self" link with a ``from`` date (only when there is at least
    one capture), the "original" link, the "timegate" link, and then either
    a terminating "self" link (no captures) or one memento link per capture.
    The final memento link is emitted with an empty ``end`` suffix.
    """
    prefix = wbrequest.wb_prefix
    url = wbrequest.wb_url.url
    mod = wbrequest.options.get('replay_mod', '')

    # The first capture is pulled up front: its date feeds the 'from' field.
    try:
        first_cdx = six.next(cdx_lines)
        from_date = timestamp_to_http_date(first_cdx['timestamp'])
    except StopIteration:
        first_cdx = None

    if first_cdx:
        # timemap link
        self_with_from = ('<{0}>; rel="self"; '
                          'type="application/link-format"; from="{1}",\n')
        yield self_with_from.format(prefix + wbrequest.wb_url.to_str(),
                                    from_date)

    # original link
    yield '<{0}>; rel="original",\n'.format(url)

    # timegate link
    timegate_url = WbUrl.to_wburl_str(url=url,
                                      mod=mod,
                                      type=WbUrl.LATEST_REPLAY)
    yield '<{0}>; rel="timegate",\n'.format(prefix + timegate_url)

    if not first_cdx:
        # terminating timemap link, no from
        self_link = '<{0}>; rel="self"; type="application/link-format"'
        yield self_link.format(prefix + wbrequest.wb_url.to_str())
        return

    # first memento link
    yield make_timemap_memento_link(first_cdx, prefix,
                                    datetime=from_date, mod=mod)

    # Each capture is emitted one step late so the last one can be
    # written with a different terminator.
    pending = None
    for cdx in cdx_lines:
        if pending:
            yield make_timemap_memento_link(pending, prefix, mod=mod)
        pending = cdx

    # last memento link, if any
    if pending:
        yield make_timemap_memento_link(pending, prefix, end='', mod=mod)
Пример #13
0
def make_timemap(wbrequest, cdx_lines):
    """Yield the lines of a link-format timemap for the requested url.

    ``cdx_lines`` is an iterator of cdx entries. The yielded strings are, in
    order: a "self" link with a ``from`` date (only when there is at least
    one capture), the "original" link, the "timegate" link, and then either
    a terminating "self" link (no captures) or one memento link per capture.
    The final memento link is emitted with an empty ``end`` suffix.
    """
    prefix = wbrequest.wb_prefix
    url = wbrequest.wb_url.url
    mod = wbrequest.options.get('replay_mod', '')

    # The first capture is pulled up front: its date feeds the 'from' field.
    try:
        first_cdx = six.next(cdx_lines)
        from_date = timestamp_to_http_date(first_cdx['timestamp'])
    except StopIteration:
        first_cdx = None

    if first_cdx:
        # timemap link
        self_with_from = ('<{0}>; rel="self"; '
                          'type="application/link-format"; from="{1}",\n')
        yield self_with_from.format(prefix + wbrequest.wb_url.to_str(),
                                    from_date)

    # original link
    yield '<{0}>; rel="original",\n'.format(url)

    # timegate link
    timegate_url = WbUrl.to_wburl_str(url=url,
                                      mod=mod,
                                      type=WbUrl.LATEST_REPLAY)
    yield '<{0}>; rel="timegate",\n'.format(prefix + timegate_url)

    if not first_cdx:
        # terminating timemap link, no from
        self_link = '<{0}>; rel="self"; type="application/link-format"'
        yield self_link.format(prefix + wbrequest.wb_url.to_str())
        return

    # first memento link
    yield make_timemap_memento_link(first_cdx, prefix,
                                    datetime=from_date, mod=mod)

    # Each capture is emitted one step late so the last one can be
    # written with a different terminator.
    pending = None
    for cdx in cdx_lines:
        if pending:
            yield make_timemap_memento_link(pending, prefix, mod=mod)
        pending = cdx

    # last memento link, if any
    if pending:
        yield make_timemap_memento_link(pending, prefix, end='', mod=mod)
Пример #14
0
def make_memento_link(cdx, prefix, datetime=None, rel='memento', end=',\n'):
    """Format one link entry pointing at the replay url of ``cdx``.

    The target is ``prefix`` followed by the replay wayback url built from
    the entry's ``original`` url and ``timestamp`` with an empty modifier.
    ``datetime`` defaults to the http date of the entry's timestamp when
    falsy. ``end`` is appended to the template before formatting.
    """
    template = '<{0}>; rel="{1}"; datetime="{2}"' + end

    link_url = prefix + WbUrl.to_wburl_str(url=cdx['original'],
                                           mod='',
                                           timestamp=cdx['timestamp'],
                                           type=WbUrl.REPLAY)

    when = datetime or timestamp_to_http_date(cdx['timestamp'])

    return template.format(link_url, rel, when)
Пример #15
0
def make_memento_link(cdx, prefix, datetime=None, rel='memento', end=',\n'):
    """Format one link entry pointing at the replay url of ``cdx``.

    The target is ``prefix`` followed by the replay wayback url built from
    the entry's ``original`` url and ``timestamp`` with an empty modifier.
    ``datetime`` defaults to the http date of the entry's timestamp when
    falsy. ``end`` is appended to the template before formatting.
    """
    template = '<{0}>; rel="{1}"; datetime="{2}"' + end

    link_url = prefix + WbUrl.to_wburl_str(url=cdx['original'],
                                           mod='',
                                           timestamp=cdx['timestamp'],
                                           type=WbUrl.REPLAY)

    when = datetime or timestamp_to_http_date(cdx['timestamp'])

    return template.format(link_url, rel, when)
Пример #16
0
    def handle_routing(self, wb_url, user, coll, rec, type,
                       is_embed=False,
                       is_display=False,
                       sources='',
                       inv_sources='',
                       redir_route=None):
        """Validate a content request and render it, or redirect/raise.

        Steps, in order:

        * ``wb_url`` is passed through ``self.add_query``; a ``'_new'`` user
          with a ``redir_route`` is delegated to
          ``self.do_create_new_and_redir``.
        * A new session on a content request triggers
          ``self.redir_set_session``.
        * For types in ``self.MODIFY_MODES`` the recording must exist and be
          open (404 otherwise), the user must be allowed to write, and must
          be neither out of space nor rate limited (402 otherwise).
        * ``'replay-coll'`` requires the collection to exist (404) and
          ``'replay'`` requires the recording to exist (404).
        * A top-frame ``'record'`` request may be redirected to an extract
          url when ``self.check_remote_archive`` returns a result.
        * Otherwise the content is rendered and returned as an
          ``HTTPResponse``; an ``UpstreamException`` is rendered through the
          ``content_error.html`` template instead.

        Raises:
            HTTPError: 404 or 402, as described above.
        """

        wb_url = self.add_query(wb_url)
        if user == '_new' and redir_route:
            return self.do_create_new_and_redir(coll, rec, wb_url, redir_route)

        sesh = self.get_session()

        if sesh.is_new() and self.is_content_request():
            self.redir_set_session()

        remote_ip = None
        frontend_cache_header = None
        patch_rec = ''

        if type in self.MODIFY_MODES:
            if not self.manager.has_recording(user, coll, rec):
                self._redir_if_sanitized(self.sanitize_title(rec),
                                         rec,
                                         wb_url)

                # don't auto create recording for inner frame w/o accessing outer frame
                raise HTTPError(404, 'No Such Recording')

            elif not self.manager.is_recording_open(user, coll, rec):
                # force creation of new recording as this one is closed
                raise HTTPError(404, 'Recording not open')

            self.manager.assert_can_write(user, coll)

            if self.manager.is_out_of_space(user):
                raise HTTPError(402, 'Out of Space')

            remote_ip = self._get_remote_ip()

            if self.manager.is_rate_limited(user, remote_ip):
                raise HTTPError(402, 'Rate Limit')

            # a patch recording name is only needed when specific inverse
            # sources are given (anything other than empty or '*')
            if inv_sources and inv_sources != '*':
                patch_rec = self.patch_of_name(rec, True)

        if type == 'replay-coll':
            res = self.manager.has_collection_is_public(user, coll)
            if not res:
                self._redir_if_sanitized(self.sanitize_title(coll),
                                         coll,
                                         wb_url)

                raise HTTPError(404, 'No Such Collection')

            # any result other than 'public' gets a private cache header
            if res != 'public':
                frontend_cache_header = ('Cache-Control', 'private')

        elif type == 'replay':
            if not self.manager.has_recording(user, coll, rec):
                raise HTTPError(404, 'No Such Recording')

        request.environ['SCRIPT_NAME'] = quote(request.environ['SCRIPT_NAME'], safe='/:')

        wb_url = self._context_massage(wb_url)

        wb_url_obj = WbUrl(wb_url)
        # top frame: modifier equals the frame modifier or starts with '$br:'
        is_top_frame = (wb_url_obj.mod == self.frame_mod or wb_url_obj.mod.startswith('$br:'))

        if type == 'record' and is_top_frame:
            result = self.check_remote_archive(wb_url, type, wb_url_obj)
            if result:
                # redirect to the mode and url returned by the archive lookup
                mode, wb_url = result
                new_url = '/{user}/{coll}/{rec}/{mode}/{url}'.format(user=user,
                                                                     coll=coll,
                                                                     rec=rec,
                                                                     mode=mode,
                                                                     url=wb_url)
                return self.redirect(new_url)

        elif type == 'replay-coll' and not is_top_frame:
            self.manager.sync_coll_index(user, coll, exists=False,
                                         do_async=False)

        # values handed to render_content; coll and rec are passed both
        # unquoted (*_orig) and quoted
        kwargs = dict(user=user,
                      coll_orig=coll,
                      id=sesh.get_id(),
                      rec_orig=rec,
                      coll=quote(coll),
                      rec=quote(rec, safe='/*'),
                      type=type,
                      sources=sources,
                      inv_sources=inv_sources,
                      patch_rec=patch_rec,
                      ip=remote_ip,
                      is_embed=is_embed,
                      is_display=is_display,
                      use_js_obj_proxy=True)

        try:
            self.check_if_content(wb_url_obj, request.environ, is_top_frame)

            resp = self.render_content(wb_url, kwargs, request.environ)

            if frontend_cache_header:
                resp.status_headers.headers.append(frontend_cache_header)

            # re-wrap the rendered response as an HTTPResponse
            resp = HTTPResponse(body=resp.body,
                                status=resp.status_headers.statusline,
                                headers=resp.status_headers.headers)

            return resp

        except UpstreamException as ue:
            # render upstream failures through the content error template
            @self.jinja2_view('content_error.html')
            def handle_error(status_code, type, url, err_info):
                response.status = status_code
                return {'url': url,
                        'status': status_code,
                        'error': err_info.get('error'),
                        'user': self.get_view_user(user),
                        'coll': coll,
                        'rec': rec,
                        'type': type,
                        'app_host': self.app_host,
                       }

            return handle_error(ue.status_code, type, ue.url, ue.msg)
    def handle_routing(self,
                       wb_url,
                       user,
                       coll,
                       rec,
                       type,
                       is_embed=False,
                       is_display=False,
                       sources='',
                       inv_sources='',
                       redir_route=None):
        """Validate a content request and render it, or redirect/raise.

        Steps, in order:

        * ``wb_url`` is passed through ``self.add_query``; a ``'_new'`` user
          with a ``redir_route`` is delegated to
          ``self.do_create_new_and_redir``.
        * A new session on a content request triggers
          ``self.redir_set_session``.
        * For ``'replay'`` and types in ``self.MODIFY_MODES`` a missing
          recording is noted; non-replay types additionally require write
          access and must be neither out of space nor rate limited (402).
        * A missing collection yields a 404, except for the ``'temp'``
          collection of an anonymous user.
        * A missing recording is created for modify modes, and is a 404 for
          ``'replay'``.
        * A patch recording is created when ``inv_sources`` is set and is
          not ``'*'``.
        * The content is rendered, passed to ``self.add_csp_header`` and
          returned as an ``HTTPResponse``; an ``UpstreamException`` is
          rendered through the ``content_error.html`` template instead.

        Raises:
            HTTPError: 404 or 402, as described above.
        """

        wb_url = self.add_query(wb_url)
        if user == '_new' and redir_route:
            return self.do_create_new_and_redir(coll, rec, wb_url, redir_route)

        # set when the requested recording does not exist
        not_found = False

        sesh = self.get_session()

        if sesh.is_new() and self.is_content_request():
            self.redir_set_session()

        remote_ip = None

        if type == 'replay' or type in self.MODIFY_MODES:
            if not self.manager.has_recording(user, coll, rec):
                not_found = True

            # write-side checks apply to the modify modes only
            if type != 'replay':
                self.manager.assert_can_write(user, coll)

                if self.manager.is_out_of_space(user):
                    raise HTTPError(402, 'Out of Space')

                remote_ip = self._get_remote_ip()

                if self.manager.is_rate_limited(user, remote_ip):
                    raise HTTPError(402, 'Rate Limit')

        # the collection lookup is skipped for an anonymous user's 'temp'
        # collection
        if ((not_found or type == 'replay-coll')
                and (not (self.manager.is_anon(user) and coll == 'temp'))
                and (not self.manager.has_collection(user, coll))):

            self._redir_if_sanitized(self.sanitize_title(coll), coll, wb_url)

            raise HTTPError(404, 'No Such Collection')

        if not_found:
            title = rec

            # modify modes get a new recording in place of the missing one
            if type in self.MODIFY_MODES:
                rec = self._create_new_rec(user,
                                           coll,
                                           title,
                                           type,
                                           no_dupe=True)

            self._redir_if_sanitized(rec, title, wb_url)

            if type == 'replay':
                raise HTTPError(404, 'No Such Recording')

        patch_rec = ''

        # a patch recording is only created when specific inverse sources
        # are given (anything other than empty or '*')
        if inv_sources and inv_sources != '*':
            patch_rec = self._create_new_rec(user,
                                             coll,
                                             'Patch of ' + rec,
                                             mode='patch',
                                             no_dupe=True)

        request.environ['SCRIPT_NAME'] = quote(request.environ['SCRIPT_NAME'],
                                               safe='/:')

        wb_url = self._context_massage(wb_url)

        wb_url_obj = WbUrl(wb_url)

        # values handed to render_content; coll and rec are passed both
        # unquoted (*_orig) and quoted
        kwargs = dict(user=user,
                      coll_orig=coll,
                      id=sesh.get_id(),
                      rec_orig=rec,
                      coll=quote(coll),
                      rec=quote(rec, safe='/*'),
                      type=type,
                      sources=sources,
                      inv_sources=inv_sources,
                      patch_rec=patch_rec,
                      ip=remote_ip,
                      is_embed=is_embed,
                      is_display=is_display)

        try:
            self.check_if_content(wb_url_obj, request.environ)

            resp = self.render_content(wb_url, kwargs, request.environ)

            self.add_csp_header(wb_url_obj, resp.status_headers)

            # re-wrap the rendered response as an HTTPResponse
            resp = HTTPResponse(body=resp.body,
                                status=resp.status_headers.statusline,
                                headers=resp.status_headers.headers)

            return resp

        except UpstreamException as ue:

            # render upstream failures through the content error template
            @self.jinja2_view('content_error.html')
            def handle_error(status_code, type, url, err_info):
                response.status = status_code
                return {
                    'url': url,
                    'status': status_code,
                    'error': err_info.get('error'),
                    'user': self.get_view_user(user),
                    'coll': coll,
                    'rec': rec,
                    'type': type,
                    'app_host': self.app_host,
                }

            return handle_error(ue.status_code, type, ue.url, ue.msg)
Пример #18
0
    def __call__(self, env, the_router):
        """Resolve a request against the route that served its referrer.

        Returns ``None`` when there is no referrer, when the referrer's host
        differs from ``HTTP_HOST``, when its path lies outside
        ``SCRIPT_NAME``, or when no route yields a request with a url
        rewriter. Otherwise ``REL_REQUEST_URI`` is rewritten with the
        referrer's rewriter. A POST is dispatched straight to the matched
        route's handler; every other method gets a 302 redirect to the
        rewritten url on the referrer's scheme and host.
        """
        referrer = env.get('HTTP_REFERER')
        routes = the_router.routes

        # ensure there is a referrer
        if referrer is None:
            return None

        ref_parts = urlsplit(referrer)

        # require that referrer starts with current Host, if any
        host = env.get('HTTP_HOST')
        if host and host != ref_parts.netloc:
            return None

        ref_path = ref_parts.path

        # must start with current app name, if not root
        script_name = env.get('SCRIPT_NAME', '')
        if script_name:
            if not ref_path.startswith(script_name):
                return None
            ref_path = ref_path[len(script_name):]

        matched_route = None
        matched_request = None

        for candidate in routes:
            matcher, coll = candidate.is_handling(ref_path)
            if not matcher:
                continue

            matched_request = the_router.parse_request(candidate, env, matcher,
                                                       coll, ref_path)
            matched_route = candidate
            break

        # must have matched one of the routes with a urlrewriter
        if not matched_request or not matched_request.urlrewriter:
            return None

        rewriter = matched_request.urlrewriter
        request_uri = env['REL_REQUEST_URI']

        # if the timestamp is already part of the path, remove it but keep
        # the trailing / to make a host relative url
        # 2013/path.html -> /path.html
        ts_segment = '/' + rewriter.wburl.timestamp + '/'
        if request_uri.startswith(ts_segment):
            request_uri = request_uri[len(ts_segment) - 1:]

        rewritten_url = rewriter.rewrite(request_uri)

        # if post, can't redirect as that would lose the post data
        # (can't use 307 because FF will show confirmation warning)
        if matched_request.method == 'POST':
            target = WbUrl(rewritten_url[len(rewriter.prefix):])
            matched_request.wb_url.url = target.url
            return matched_route.handler(matched_request)

        redirect_url = urlunsplit(
            (ref_parts.scheme, ref_parts.netloc, rewritten_url, '', ''))

        return WbResponse.redir_response(redirect_url,
                                         status='302 Temp Redirect')
Пример #19
0
    def render_content(self, wbrequest):
        """Fetch and rewrite the live content for ``wbrequest``.

        Requests with the ``'vi_'`` modifier are answered by
        ``self._get_video_info``. For everything else the url is fetched
        through ``self.rewriter.fetch_request`` and turned into a response
        with ``self._make_response``.

        Range handling applies only when ``self.proxies`` is set:

        * an open-ended range starting at 0 (with ``use_206`` set) has its
          ``HTTP_RANGE`` header removed before fetching, and a range header
          is added back onto the response from its ``Content-Length``;
        * any other range disables the proxies for this fetch and triggers
          ``self._add_proxy_ping`` afterwards when a cache key was produced.

        When a range was extracted and a referrer is known, video info is
        also requested for it. That call is best effort: a failure is
        reported on stdout and does not affect the returned response.
        """
        if wbrequest.wb_url.mod == 'vi_':
            return self._get_video_info(wbrequest)

        head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
        req_headers = self._live_request_headers(wbrequest)

        ref_wburl_str = wbrequest.extract_referrer_wburl_str()
        if ref_wburl_str:
            wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url

        ignore_proxies = False
        url = None
        rangeres = None

        readd_range = False
        cache_key = None

        if self.proxies:
            rangeres = wbrequest.extract_range()

            if rangeres:
                url, start, end, use_206 = rangeres

                # if bytes=0- Range request,
                # simply remove the range and still proxy
                if start == 0 and not end and use_206:
                    wbrequest.wb_url.url = url
                    del wbrequest.env['HTTP_RANGE']
                    readd_range = True
                else:
                    # disables proxy
                    ignore_proxies = True

                    # sets cache_key only if not already cached
                    cache_key = self._get_cache_key('r:', url)

        result = self.rewriter.fetch_request(wbrequest.wb_url.url,
                                             wbrequest.urlrewriter,
                                             head_insert_func=head_insert_func,
                                             req_headers=req_headers,
                                             env=wbrequest.env,
                                             ignore_proxies=ignore_proxies,
                                             verify=self.verify)

        wbresponse = self._make_response(wbrequest, *result)

        if readd_range:
            content_length = (
                wbresponse.status_headers.get_header('Content-Length'))
            try:
                content_length = int(content_length)
                wbresponse.status_headers.add_range(0, content_length,
                                                    content_length)
            except (ValueError, TypeError):
                # missing or non-numeric Content-Length: leave headers as-is
                pass

        if cache_key:
            self._add_proxy_ping(cache_key, url, wbrequest, wbresponse)

        if rangeres:
            referrer = wbrequest.env.get('REL_REFERER')

            # also ping video info
            if referrer:
                try:
                    self._get_video_info(wbrequest,
                                         info_url=referrer,
                                         video_url=url)
                except Exception:
                    # Best effort only. Catching Exception rather than using
                    # a bare except lets KeyboardInterrupt and SystemExit
                    # propagate.
                    print('Error getting video info')

        return wbresponse
Пример #20
0
    def handle_magic_page(self, env):
        """Dispatch a request made to one of the proxy's special host names.

        The host name in ``env['pywb.proxy_host']`` selects the action, and
        the request path plus query (without the leading ``/``) is carried
        along as ``path_url``:

        * ``auto...``: set-host cookie response when a collection is known,
          otherwise the ``'select'`` magic response.
        * ``query....``: ``path_url`` is parsed as a ``WbUrl``. A query url
          returns ``None``; otherwise the timestamp is stored for the
          session and the client is redirected to the url.
        * ending in ``self.set_prefix``: the session id is created or
          renewed, the collection is stored for it, and a set-host cookie
          response is returned.
        * containing ``self.sethost_prefix``: redirect to the domain after
          the prefix, with cookie headers for the session id before it.
        * containing ``'select.'``: the collection selection view.

        Returns ``None`` when no host pattern matches.
        """
        request_parts = urlparse.urlsplit(env['REL_REQUEST_URI'])
        server_name = env['pywb.proxy_host']

        path_url = request_parts.path[1:]
        if request_parts.query:
            path_url = path_url + '?' + request_parts.query

        if server_name.startswith('auto'):
            coll, _ts, sesh_id = self.get_coll(env)

            if not coll:
                return self.make_magic_response('select', path_url, env)

            return self.make_sethost_cookie_response(sesh_id, path_url, env)

        if server_name.startswith('query.'):
            wb_url = WbUrl(path_url)

            # only dealing with specific timestamp setting
            if wb_url.is_query():
                return None

            coll, _ts, sesh_id = self.get_coll(env)
            if not coll:
                return self.make_magic_response('select', path_url, env)

            self.set_ts(sesh_id, wb_url.timestamp)
            return self.make_redir_response(wb_url.url)

        if server_name.endswith(self.set_prefix):
            previous_id = extract_client_cookie(env, self.cookie_name)
            sesh_id = self.create_renew_sesh_id(previous_id)

            # cookie headers are only needed when the id changed
            headers = None
            if sesh_id != previous_id:
                headers = self.make_cookie_headers(sesh_id, self.magic_name)

            coll = server_name[:-len(self.set_prefix)]

            # set sesh value
            self.set_coll(sesh_id, coll)

            return self.make_sethost_cookie_response(sesh_id, path_url, env,
                                                     headers=headers)

        if self.sethost_prefix in server_name:
            split_at = server_name.find(self.sethost_prefix)
            sesh_id = server_name[:split_at]
            domain = server_name[split_at + len(self.sethost_prefix):]

            headers = self.make_cookie_headers(sesh_id, domain)

            full_url = env['pywb.proxy_scheme'] + '://' + domain
            full_url = full_url + '/' + path_url
            return self.make_redir_response(full_url, headers=headers)

        if 'select.' in server_name:
            coll, _ts, sesh_id = self.get_coll(env)

            route_temp = '-set.' + self.magic_name + '/' + path_url

            return self.proxy_select_view.render_response(
                routes=self.routes,
                route_temp=route_temp,
                coll=coll,
                url=path_url)

        return None
Пример #21
0
    def handle_routing(self,
                       wb_url,
                       user,
                       coll_name,
                       rec_name,
                       type,
                       is_embed=False,
                       is_display=False,
                       sources='',
                       inv_sources='',
                       redir_route=None):
        """Validate a user/collection/recording request and render its content.

        The user, collection and recording are looked up through
        ``self.user_manager``; mode-specific checks are applied depending on
        ``type``; the url is then rendered through ``self.render_content``.

        :param wb_url: requested url string. It is passed through
            ``self._full_url`` and ``self._context_massage`` and then parsed
            into a ``WbUrl``.
        :param user: user name to look up. ``'_new'`` together with a truthy
            ``redir_route`` short-circuits to ``do_create_new_and_redir``;
            ``'api'`` only changes the 404 message used for a missing user.
        :param coll_name: collection name to look up.
        :param rec_name: recording name to look up.
        :param type: mode string. Compared against ``self.MODIFY_MODES``,
            ``'replay-coll'``, ``'replay'`` and ``'record'``.
        :param is_embed: forwarded unchanged in the render kwargs.
        :param is_display: forwarded unchanged in the render kwargs.
        :param sources: forwarded unchanged in the render kwargs.
        :param inv_sources: forwarded in the render kwargs; in a modify mode,
            a value that is non-empty and not ``'*'`` also causes the
            recording's patch recording to be fetched.
        :param redir_route: only consulted when ``user == '_new'``.
        :returns: one of: the result of ``do_create_new_and_redir``, a
            redirect, an ``HTTPResponse`` built from the rendered content, or
            (on ``UpstreamException``) either a 307 redirect to
            ``self.content_error_redirect`` or the ``content_error.html`` view.
        """

        wb_url = self._full_url(wb_url)
        if user == '_new' and redir_route:
            return self.do_create_new_and_redir(coll_name, rec_name, wb_url,
                                                redir_route)

        sesh = self.get_session()

        # defaults; only filled in by the mode-specific branches below
        remote_ip = None
        frontend_cache_header = None
        patch_recording = None

        the_user, collection, recording = self.user_manager.get_user_coll_rec(
            user, coll_name, rec_name)

        if not the_user:
            msg = 'not_found' if user == 'api' else 'no_such_user'
            # NOTE(review): _raise_error presumably aborts the request -- confirm
            self._raise_error(404, msg)

        # ids are None when the collection / recording was not found
        coll = collection.my_id if collection else None
        rec = recording.my_id if recording else None

        if type in self.MODIFY_MODES:
            if sesh.is_new() and self.is_content_request():
                self.redir_set_session()

            if not recording:
                self._redir_if_sanitized(self.sanitize_title(rec_name),
                                         rec_name, wb_url)

                # don't auto create recording for inner frame w/o accessing outer frame
                self._raise_error(404, 'no_such_recording')

            elif not recording.is_open():
                # force creation of new recording as this one is closed
                self._raise_error(400, 'recording_not_open')

            collection.access.assert_can_write_coll(collection)

            if the_user.is_out_of_space():
                self._raise_error(402, 'out_of_space')

            remote_ip = self._get_remote_ip()

            # check_rate_limit returns the ip value that is later passed on as kwargs['ip']
            remote_ip = self.check_rate_limit(the_user, remote_ip)

            if inv_sources and inv_sources != '*':
                #patch_rec_name = self.patch_of_name(rec, True)
                patch_recording = recording.get_patch_recording()
                #patch_recording = collection.get_recording_by_name(patch_rec_name)

        if type in ('replay-coll', 'replay'):
            if not collection:
                self._redir_if_sanitized(self.sanitize_title(coll_name),
                                         coll_name, wb_url)

                # for a new session on a content request, set the session
                # first; otherwise report the collection as missing
                if sesh.is_new() and self.is_content_request():
                    self.redir_set_session()
                else:
                    self._raise_error(404, 'no_such_collection')

            access = self.access.check_read_access_public(collection)

            # no read access is reported with the same 404 as a missing collection
            if not access:
                if sesh.is_new() and self.is_content_request():
                    self.redir_set_session()
                else:
                    self._raise_error(404, 'no_such_collection')

            # any access value other than 'public' gets a private Cache-Control header
            if access != 'public':
                frontend_cache_header = ('Cache-Control', 'private')

            if type == 'replay':
                if not recording:
                    self._raise_error(404, 'no_such_recording')

        # percent-encode SCRIPT_NAME in place, leaving '/' and ':' untouched
        request.environ['SCRIPT_NAME'] = quote(request.environ['SCRIPT_NAME'],
                                               safe='/:')

        wb_url = self._context_massage(wb_url)

        wb_url_obj = WbUrl(wb_url)

        # top frame: modifier equals self.frame_mod or starts with '$br:'
        is_top_frame = (wb_url_obj.mod == self.frame_mod
                        or wb_url_obj.mod.startswith('$br:'))

        if type == 'record' and is_top_frame:
            # if check_remote_archive returns a (mode, url) pair, redirect
            # to the same user/coll/rec under that mode
            result = self.check_remote_archive(wb_url, type, wb_url_obj)
            if result:
                mode, wb_url = result
                new_url = '/{user}/{coll}/{rec}/{mode}/{url}'.format(
                    user=user,
                    coll=coll_name,
                    rec=rec_name,
                    mode=mode,
                    url=wb_url)
                return self.redirect(new_url)

        elif type == 'replay-coll' and not is_top_frame:
            collection.sync_coll_index(exists=False, do_async=False)

        # parameters handed to render_content (and to the browser manager below)
        kwargs = dict(
            user=user,
            id=sesh.get_id(),
            coll=coll,
            rec=rec,
            coll_name=quote(coll_name),
            rec_name=quote(rec_name, safe='/*'),
            the_user=the_user,
            collection=collection,
            recording=recording,
            patch_recording=patch_recording,
            type=type,
            sources=sources,
            inv_sources=inv_sources,
            patch_rec=patch_recording.my_id if patch_recording else None,
            ip=remote_ip,
            is_embed=is_embed,
            is_display=is_display)

        # top-frame replay but through a proxy, redirect to original
        if is_top_frame and 'wsgiprox.proxy_host' in request.environ:
            kwargs['url'] = wb_url_obj.url
            kwargs['request_ts'] = wb_url_obj.timestamp
            self.browser_mgr.update_local_browser(kwargs)
            return redirect(wb_url_obj.url)

        try:
            self.check_if_content(wb_url_obj, request.environ, is_top_frame)

            request.environ['pywb.static_prefix'] = self.BUNDLE_PREFIX

            # BEGIN PERMA CUSTOMIZATIONS
            try:
                resp = self.render_content(wb_url, kwargs, request.environ)
            except UpstreamException as ue:
                if ue.status_code == 404:
                    # Retry all 404s after 1s, in a broad effort to allay
                    # https://github.com/harvard-lil/perma/issues/2633
                    # until I pinpoint the real problem.
                    # An UpstreamException from the retry is handled by the
                    # outer except clause below.
                    time.sleep(self.sleep_on_404)
                    resp = self.render_content(wb_url, kwargs, request.environ)
                else:
                    raise
            # END PERMA CUSTOMIZATIONS

            if frontend_cache_header:
                resp.status_headers.headers.append(frontend_cache_header)

            # BEGIN PERMA CUSTOMIZATIONS
            # make sure all headers can be encoded in ascii
            # workaround for https://github.com/harvard-lil/perma/issues/2603
            # technique adapted from https://github.com/webrecorder/warcio/blob/master/warcio/statusandheaders.py#L183
            headers = resp.status_headers.headers
            # iterate over a copy because entries of `headers` are replaced in the loop
            for i, header in enumerate(headers[:]):
                try:
                    header[1].encode('ascii')
                except UnicodeEncodeError:
                    escaped = ENCODE_HEADER_RX.sub(do_encode, header[1])
                    # regex substitution changed nothing: percent-encode the whole value
                    if escaped == header[1]:
                        escaped = quote(escaped)
                    headers[i] = (header[0], escaped)

            # rebuild the response from the body, status line and sanitized headers
            resp = HTTPResponse(body=resp.body,
                                status=resp.status_headers.statusline,
                                headers=headers)
            # END PERMA CUSTOMIZATIONS

            return resp

        except UpstreamException as ue:
            # context for the error page / error redirect query string
            err_context = {
                'url': ue.url,
                'status': ue.status_code,
                'error': ue.msg.get('error'),
                'timestamp': wb_url_obj.timestamp if wb_url_obj else '',
                'user': user,
                'coll': coll_name,
                'rec': rec_name,
                'type': type,
                'app_host': self.app_host,
            }

            # view that sets the upstream status code on the response and
            # returns the context for the content_error.html template
            @self.jinja2_view('content_error.html')
            def handle_error(error):
                response.status = ue.status_code
                return error

            if self.content_error_redirect:
                return redirect(self.content_error_redirect + '?' +
                                urlencode(err_context),
                                code=307)
            else:
                return handle_error(err_context)
Пример #22
0
    def process_record(self, record, flow):
        """Rewrite a proxied response record and choose how to frame its body.

        :param record: parsed record; its ``http_headers`` and ``raw_stream``
            are read.
        :param flow: proxy flow; the request url, scheme, headers and
            ``http_version``, the response headers and ``extra_data`` are read.
        :returns: a ``(status_headers, body)`` pair. Without a configured
            ``self.content_rewriter`` the record is passed through
            unmodified; otherwise the rewritten headers and a wrapped
            body iterator are returned.
        """
        headers = flow.response.headers
        url = flow.request.req_url
        scheme = flow.request.req_scheme

        if not self.content_rewriter:
            # no rewriter configured: hand the record back untouched
            return record.http_headers, StreamIO(record.raw_stream)

        cookie_rewriter = None

        template_params = flow.extra_data

        environ = {
            'pywb_proxy_magic': self.proxy_magic,
            'webrec.template_params': template_params
        }

        wb_url = WbUrl(url)
        wb_prefix = ''
        host_prefix = scheme + '://' + self.proxy_magic
        urlrewriter = SchemeOnlyUrlRewriter(wb_url, '')

        requested_with = flow.request.headers.get('X-Requested-With', '')
        if requested_with.lower() == 'xmlhttprequest':
            urlrewriter.rewrite_opts['is_ajax'] = True

        head_insert_func = self.head_insert_view.create_insert_func(
            wb_url, wb_prefix, host_prefix, url, environ, False)

        urlkey = canonicalize(wb_url.url)

        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(
            headers.get('Memento-Datetime'))
        cdx['url'] = wb_url.url
        if headers.get('Webagg-Source-Coll') == 'live':
            cdx['is_live'] = 'true'

        result = self.content_rewriter.rewrite_content(
            urlrewriter, record.http_headers, record.raw_stream,
            head_insert_func, urlkey, cdx, cookie_rewriter, environ)

        status_headers, gen, is_rw = result

        status_headers.remove_header('Content-Security-Policy')

        # A positive content-length means the body can be streamed as-is.
        res = status_headers.get_header('content-length')
        try:
            has_length = int(res) > 0
        except (TypeError, ValueError):
            # header is missing (None) or not numeric; only these are
            # expected, so interrupts and exits are no longer swallowed
            has_length = False

        if has_length:
            return status_headers, IterIdent(gen)

        # need to either chunk or buffer to get content-length
        if flow.request.http_version == 'HTTP/1.1':
            status_headers.remove_header('content-length')
            status_headers.headers.append(('Transfer-Encoding', 'chunked'))
            # NOTE(review): the body is not chunk-encoded here
            # (chunk_encode_iter is disabled); presumably done downstream -- confirm
            #gen = chunk_encode_iter(gen)
        else:
            gen = buffer_iter(status_headers, gen)

        return status_headers, IterIdent(gen)
Пример #23
0
    def render_content(self, wb_url, kwargs, environ):
        """Fetch the record for ``wb_url`` upstream and return it rewritten.

        :param wb_url: url string, parsed into a ``WbUrl``.
        :param kwargs: routing parameters; ``kwargs.get('type')`` enables
            range handling for ``'record'`` / ``'patch'`` and the dict is
            forwarded to the request and helper calls.
        :param environ: WSGI environ; ``HTTP_RANGE`` may be removed from it.
        :returns: a ``WbResponse``: either the custom response produced by
            ``handle_custom_response`` or the rewritten upstream record.
        :raises UpstreamException: when the upstream status code is >= 400.
        """
        wb_url = WbUrl(wb_url)

        host_prefix = self.get_host_prefix(environ)
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        resp = self.handle_custom_response(environ, wb_url,
                                           full_prefix, host_prefix, kwargs)
        if resp is not None:
            content_type = 'text/html'

            # if not replay outer frame, specify utf-8 charset
            if not self.is_framed_replay(wb_url):
                content_type += '; charset=utf-8'

            return WbResponse.text_response(resp, content_type=content_type)

        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix)

        self.unrewrite_referrer(environ)

        urlkey = canonicalize(wb_url.url)

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url,
                                       self.content_rewriter)

        inputreq.include_post_query(wb_url.url)

        mod_url = None
        use_206 = False
        rangeres = None

        readd_range = False
        async_record_url = None

        if kwargs.get('type') in ('record', 'patch'):
            rangeres = inputreq.extract_range()

            if rangeres:
                mod_url, start, end, use_206 = rangeres

                # if bytes=0- Range request,
                # simply remove the range and still proxy
                if start == 0 and not end and use_206:
                    wb_url.url = mod_url
                    inputreq.url = mod_url

                    del environ['HTTP_RANGE']
                    readd_range = True
                else:
                    # other ranges: the url is requested separately in the
                    # background after the main request succeeds (see below)
                    async_record_url = mod_url

        skip = async_record_url is not None

        setcookie_headers = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            res = self.cookie_tracker.get_cookie_headers(wb_url.url, cookie_key)
            inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip)

        if r.status_code >= 400:
            error = None
            try:
                error = r.raw.read()
            except Exception:
                # best effort: the body is only used as error detail.
                # Exception (not a bare except) so GreenletExit and
                # KeyboardInterrupt are not swallowed.
                pass
            finally:
                # always release the upstream stream, even when read() failed
                try:
                    r.raw.close()
                except Exception:
                    pass

            # 'replace' keeps a non-UTF-8 error body from masking the
            # UpstreamException with a UnicodeDecodeError
            error = error.decode('utf-8', 'replace') if error else ''

            details = dict(args=kwargs, error=error)
            raise UpstreamException(r.status_code, url=wb_url.url, details=details)

        if async_record_url:
            environ.pop('HTTP_RANGE', '')
            gevent.spawn(self._do_async_req,
                         inputreq,
                         async_record_url,
                         wb_url,
                         kwargs,
                         False)

        record = self.loader.parse_record_stream(r.raw)

        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
        cdx['url'] = wb_url.url

        self._add_custom_params(cdx, r.headers, kwargs)

        if readd_range:
            # the Range header was removed before the request; add a range
            # covering the full Content-Length back onto the record headers
            content_length = record.status_headers.get_header('Content-Length')
            try:
                content_length = int(content_length)
                record.status_headers.add_range(0, content_length,
                                                content_length)
            except (ValueError, TypeError):
                pass

        if self.is_ajax(environ):
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = self.head_insert_view.create_insert_func(
                wb_url,
                full_prefix,
                host_prefix,
                top_url,
                environ,
                self.framed_replay)

        cookie_rewriter = None
        if self.cookie_tracker:
            cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                               cookie_key)

        result = self.content_rewriter.rewrite_content(urlrewriter,
                                                       record.status_headers,
                                                       record.stream,
                                                       head_insert_func,
                                                       urlkey,
                                                       cdx,
                                                       cookie_rewriter,
                                                       environ)

        status_headers, gen, is_rw = result

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        return WbResponse(status_headers, gen)
Пример #24
0
    def render_content(self, wb_url, kwargs, environ):
        """Fetch the record for ``wb_url`` upstream and return it rewritten.

        :param wb_url: url string; ``'#'`` is replaced by ``'%23'`` before it
            is parsed into a ``WbUrl``.
        :param kwargs: routing parameters, forwarded to the upstream request
            and helper calls; ``kwargs.get('coll', '')`` is passed to the
            head-insert function.
        :param environ: WSGI environ; several keys are read and some are
            written (``wsgi.url_scheme``, ``pywb.host_prefix``,
            ``pywb_proxy_magic``) or popped (``HTTP_X_WOMBAT_HISTORY_PAGE``).
        :returns: a ``WbResponse`` with the rewritten record, a redirect
            response, a JSON response (history-page requests), or the result
            of ``self.format_response`` for a custom response.
        :raises UpstreamException: when the upstream status code is >= 400.
        """
        # NOTE(review): presumably keeps a literal '#' from being treated as
        # a fragment when the url is parsed -- confirm
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)

        # X-Forwarded-Proto (falling back to self.force_scheme) overrides the scheme
        proto = environ.get('HTTP_X_FORWARDED_PROTO', self.force_scheme)

        if proto:
            environ['wsgi.url_scheme'] = proto

        # a history-page header replaces the requested url and forces ajax handling
        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
        if history_page:
            wb_url.url = history_page
            is_ajax = True
        else:
            is_ajax = self.is_ajax(environ)

        is_timegate = self._check_accept_dt(wb_url, environ)

        host_prefix = self.get_host_prefix(environ)
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix
        environ['pywb.host_prefix'] = host_prefix
        pywb_static_prefix = host_prefix + environ.get(
            'pywb.app_prefix', '') + environ.get('pywb.static_prefix',
                                                 '/static/')
        # proxy mode is detected from the presence of the wsgiprox key
        is_proxy = ('wsgiprox.proxy_host' in environ)

        response = self.handle_custom_response(environ, wb_url, full_prefix,
                                               host_prefix, kwargs)

        if response:
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy)

        if is_proxy:
            # proxy mode: identity url rewriter and no framed replay
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix,
                                      pywb_static_prefix=pywb_static_prefix)

            framed_replay = self.framed_replay

        # a url with an empty path is redirected to the same url with path '/'
        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            return self.send_redirect('/', url_parts, urlrewriter)

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        # pick the content rewriter callable used at the end of this method
        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(
            inputreq, wb_url)

        setcookie_headers = None
        cookie_key = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            if cookie_key:
                res = self.cookie_tracker.get_cookie_headers(
                    wb_url.url, urlrewriter, cookie_key,
                    environ.get('HTTP_COOKIE', ''))
                inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        if r.status_code >= 400:
            # best-effort read of the error body; the stream is always closed
            error = None
            try:
                error = r.raw.read()
            except Exception:
                pass
            finally:
                no_except_close(r.raw)

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            raise UpstreamException(r.status_code,
                                    url=wb_url.url,
                                    details=details)

        # the cdx entry is built from the 'Warcserver-Cdx' response header
        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        cdx_url_parts = urlsplit(cdx['url'])

        # the cdx url path ends in '/' but the requested path does not:
        # redirect to the requested path with a trailing slash
        if cdx_url_parts.path.endswith(
                '/') and not url_parts.path.endswith('/'):
            # add trailing slash
            new_path = url_parts.path + '/'

            no_except_close(r.raw)

            return self.send_redirect(new_path, url_parts, urlrewriter)

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        # cdx['urlkey'] = urlkey
        # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        # cdx['url'] = target_uri

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redir to exact, redir if url or ts are different
        if self.redirect_to_exact:
            if (set_content_loc or (wb_url.timestamp != cdx.get('timestamp')
                                    and not cdx.get('is_live'))):

                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect')
                if self.enable_memento:
                    # timegate requests outside proxy mode get the memento
                    # links; all others get only an 'original' Link header
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri, full_prefix,
                                                memento_dt, cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate, is_proxy)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(
                            target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs, record)

        # when _add_range reports a range was applied, switch the modifier to 'id_'
        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        if is_ajax:
            # ajax requests get no head insert
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.create_insert_func(
                wb_url,
                full_prefix,
                host_prefix,
                top_url,
                environ,
                framed_replay,
                coll=kwargs.get('coll', ''),
                replay_mod=self.replay_mod,
                config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker and cookie_key:
            # skip add cookie if service worker is not 200
            # it seems cookie headers from service workers are not applied, so don't update in cache
            if wb_url.mod == 'sw_':
                cookie_key = None

            cookie_rewriter = self.cookie_tracker.get_rewriter(
                urlrewriter, cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter,
                            head_insert_func, cdx, environ)

        status_headers, gen, is_rw = result

        if history_page:
            # history-page request: record the page under a title taken from
            # the content, then the title header, then the url itself
            title = DefaultRewriter._extract_title(gen)
            if not title:
                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

            if not title:
                title = history_page

            self._add_history_page(cdx, kwargs, title)
            return WbResponse.json_response({'title': title})

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # a status line without a space gets a ' None' suffix appended
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'], full_prefix, memento_dt,
                                    cdx['timestamp'], status_headers,
                                    is_timegate, is_proxy,
                                    cdx.get('source-coll'))

            set_content_loc = True

        # Content-Location is only added when exact-redirects are disabled
        if set_content_loc and not self.redirect_to_exact:
            status_headers.headers.append(
                ('Content-Location',
                 urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                         url=cdx['url'])))
        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        return response
Пример #25
0
    def render_content(self, wb_url, kwargs, environ):
        """Fetch the record for ``wb_url`` upstream and return it rewritten.

        :param wb_url: url string; ``'#'`` is replaced by ``'%23'`` before it
            is parsed into a ``WbUrl``.
        :param kwargs: routing parameters, forwarded to the upstream request
            and helper calls.
        :param environ: WSGI environ; ``pywb_proxy_magic`` is set on it in
            proxy mode.
        :returns: a ``WbResponse`` with the rewritten record, a 307 redirect
            response, or the result of ``self.format_response`` for a
            custom response.
        :raises UpstreamException: when the upstream status code is >= 400.
        """
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)
        is_timegate = self._check_accept_dt(wb_url, environ)

        host_prefix = self.get_host_prefix(environ)
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        # proxy mode is detected from the presence of the wsgiprox key
        is_proxy = ('wsgiprox.proxy_host' in environ)

        response = self.handle_custom_response(environ, wb_url,
                                               full_prefix, host_prefix,
                                               kwargs)

        if response:
            return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix)

            framed_replay = self.framed_replay

        # a url with an empty path is redirected to the same url with path '/'
        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            scheme, netloc, path, query, frag = url_parts
            path = '/'
            url = urlunsplit((scheme, netloc, path, query, frag))
            resp = WbResponse.redir_response(urlrewriter.rewrite(url),
                                             '307 Temporary Redirect')

            if self.enable_memento:
                resp.status_headers['Link'] = MementoUtils.make_link(url, 'original')

            return resp

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(inputreq, wb_url)

        setcookie_headers = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
            inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        if r.status_code >= 400:
            error = None
            try:
                error = r.raw.read()
            except Exception:
                # best effort: the body is only used as error detail.
                # Exception (not a bare except) so interrupts and exit
                # signals are not swallowed.
                pass
            finally:
                # always release the upstream stream, even when read() failed
                try:
                    r.raw.close()
                except Exception:
                    pass

            # 'replace' keeps a non-UTF-8 error body from masking the
            # UpstreamException with a UnicodeDecodeError
            error = error.decode('utf-8', 'replace') if error else ''

            details = dict(args=kwargs, error=error)
            raise UpstreamException(r.status_code, url=wb_url.url, details=details)

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        # the cdx entry is built from the 'Warcserver-Cdx' response header
        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redir to exact, redir if url or ts are different
        if self.redirect_to_exact:
            ts_differs = (wb_url.timestamp != cdx.get('timestamp')
                          and not cdx.get('is_live'))

            if set_content_loc or ts_differs:
                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url, '307 Temporary Redirect')
                if self.enable_memento:
                    # timegate requests outside proxy mode get the memento
                    # links; all others get only an 'original' Link header
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri, full_prefix,
                                                memento_dt, cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate, is_proxy)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs)

        # when _add_range reports a range was applied, switch the modifier to 'id_'
        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        is_ajax = self.is_ajax(environ)

        if is_ajax:
            # ajax requests get no head insert
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = self.head_insert_view.create_insert_func(
                wb_url,
                full_prefix,
                host_prefix,
                top_url,
                environ,
                framed_replay,
                config=self.config)

        cookie_rewriter = None
        if self.cookie_tracker:
            cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                               cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx)

        status_headers, gen, is_rw = result

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # a status line without a space gets a ' None' suffix appended
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'], full_prefix,
                                    memento_dt, cdx['timestamp'], status_headers,
                                    is_timegate, is_proxy, cdx.get('source-coll'))

            set_content_loc = True

        # Content-Location is only added when exact-redirects are disabled
        if set_content_loc and not self.redirect_to_exact:
            content_location = urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                                       url=cdx['url'])
            status_headers.headers.append(('Content-Location', content_location))

        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        return response
Пример #26
0
def to_uri_pencode(url):
    """Parse *url* as a ``WbUrl`` and return the result of its ``get_url()``.

    NOTE(review): the function name suggests the returned value is a
    percent-encoded URI; that depends on ``WbUrl.get_url`` -- confirm.
    """
    parsed = WbUrl(url)
    return parsed.get_url()
Пример #27
0
    def handle_routing(self, wb_url, user, coll_name, rec_name, type,
                       is_embed=False,
                       is_display=False,
                       sources='',
                       inv_sources='',
                       redir_route=None):
        """Validate a content request for one routing mode and render it.

        The user, collection and recording are looked up through
        ``self.user_manager``; mode-specific checks are applied, then the
        content is rendered with ``self.render_content`` and wrapped in an
        ``HTTPResponse``.

        :param wb_url: request url string; normalized with ``_full_url`` and
            ``_context_massage`` before being parsed as a ``WbUrl``
        :param user: user name; ``'_new'`` combined with ``redir_route``
            short-circuits to ``do_create_new_and_redir``
        :param coll_name: collection name taken from the route
        :param rec_name: recording name taken from the route
        :param type: routing mode; compared here against
            ``self.MODIFY_MODES``, ``'record'``, ``'replay'`` and
            ``'replay-coll'`` (the name shadows the builtin but is part of
            the existing interface)
        :param is_embed: forwarded unchanged in the render kwargs
        :param is_display: forwarded unchanged in the render kwargs
        :param sources: forwarded unchanged in the render kwargs
        :param inv_sources: forwarded in the render kwargs; when non-empty
            and not ``'*'`` in a modify mode, the recording's patch
            recording is looked up
        :param redir_route: see ``user``
        :returns: a redirect result, an ``HTTPResponse`` built from the
            rendered content, or the ``content_error.html`` view (or a 307
            redirect to ``self.content_error_redirect``) when rendering
            raises ``UpstreamException``
        """
        wb_url = self._full_url(wb_url)
        if user == '_new' and redir_route:
            return self.do_create_new_and_redir(coll_name, rec_name, wb_url, redir_route)

        session = self.get_session()

        def redir_new_session_or_404():
            # a new session on a content request gets the session redirect,
            # anything else is reported as a missing collection
            if session.is_new() and self.is_content_request():
                self.redir_set_session()
            else:
                self._raise_error(404, 'no_such_collection')

        client_ip = None
        cache_header = None
        patch_recording = None

        the_user, collection, recording = self.user_manager.get_user_coll_rec(
            user, coll_name, rec_name)

        if not the_user:
            self._raise_error(404, 'not_found' if user == 'api' else 'no_such_user')

        coll_id = collection.my_id if collection else None
        rec_id = recording.my_id if recording else None

        if type in self.MODIFY_MODES:
            if session.is_new() and self.is_content_request():
                self.redir_set_session()

            if not recording:
                clean_rec_name = self.sanitize_title(rec_name)
                self._redir_if_sanitized(clean_rec_name, rec_name, wb_url)

                # don't auto create recording for inner frame w/o accessing outer frame
                self._raise_error(404, 'no_such_recording')

            elif not recording.is_open():
                # force creation of new recording as this one is closed
                self._raise_error(400, 'recording_not_open')

            collection.access.assert_can_write_coll(collection)

            if the_user.is_out_of_space():
                self._raise_error(402, 'out_of_space')

            raw_ip = self._get_remote_ip()
            client_ip = self.check_rate_limit(the_user, raw_ip)

            if inv_sources and inv_sources != '*':
                patch_recording = recording.get_patch_recording()

        if type in ('replay-coll', 'replay'):
            if not collection:
                clean_coll_name = self.sanitize_title(coll_name)
                self._redir_if_sanitized(clean_coll_name, coll_name, wb_url)
                redir_new_session_or_404()

            access = self.access.check_read_access_public(collection)

            if not access:
                redir_new_session_or_404()

            if access != 'public':
                cache_header = ('Cache-Control', 'private')

            if type == 'replay' and not recording:
                self._raise_error(404, 'no_such_recording')

        request.environ['SCRIPT_NAME'] = quote(request.environ['SCRIPT_NAME'], safe='/:')

        wb_url = self._context_massage(wb_url)
        wb_url_obj = WbUrl(wb_url)

        mod = wb_url_obj.mod
        is_top_frame = mod == self.frame_mod or mod.startswith('$br:')

        if type == 'record' and is_top_frame:
            found = self.check_remote_archive(wb_url, type, wb_url_obj)
            if found:
                mode, archive_url = found
                target = '/{user}/{coll}/{rec}/{mode}/{url}'.format(user=user,
                                                                    coll=coll_name,
                                                                    rec=rec_name,
                                                                    mode=mode,
                                                                    url=archive_url)
                return self.redirect(target)

        elif type == 'replay-coll' and not is_top_frame:
            collection.sync_coll_index(exists=False, do_async=False)

        kwargs = {
            'user': user,
            'id': session.get_id(),
            'coll': coll_id,
            'rec': rec_id,
            'coll_name': quote(coll_name),
            'rec_name': quote(rec_name, safe='/*'),

            'the_user': the_user,
            'collection': collection,
            'recording': recording,
            'patch_recording': patch_recording,

            'type': type,
            'sources': sources,
            'inv_sources': inv_sources,
            'patch_rec': patch_recording.my_id if patch_recording else None,
            'ip': client_ip,
            'is_embed': is_embed,
            'is_display': is_display,
        }

        # top-frame replay but through a proxy, redirect to original
        if is_top_frame and 'wsgiprox.proxy_host' in request.environ:
            kwargs['url'] = wb_url_obj.url
            kwargs['request_ts'] = wb_url_obj.timestamp
            self.browser_mgr.update_local_browser(kwargs)
            return redirect(wb_url_obj.url)

        try:
            self.check_if_content(wb_url_obj, request.environ, is_top_frame)

            rendered = self.render_content(wb_url, kwargs, request.environ)

            if cache_header:
                rendered.status_headers.headers.append(cache_header)

            return HTTPResponse(body=rendered.body,
                                status=rendered.status_headers.statusline,
                                headers=rendered.status_headers.headers)

        except UpstreamException as upstream_err:
            err_context = {
                'url': upstream_err.url,
                'status': upstream_err.status_code,
                'error': upstream_err.msg.get('error'),
                'timestamp': wb_url_obj.timestamp if wb_url_obj else '',
                'user': user,
                'coll': coll_name,
                'rec': rec_name,
                'type': type,
                'app_host': self.app_host,
            }

            # the view is created before the redirect check, so the
            # decorator runs on every upstream error
            @self.jinja2_view('content_error.html')
            def handle_error(error):
                response.status = upstream_err.status_code
                return error

            if not self.content_error_redirect:
                return handle_error(err_context)

            return redirect(self.content_error_redirect + '?' + urlencode(err_context), code=307)
Пример #28
0
    def render_content(self, wb_url, kwargs, environ):
        """Fetch the capture addressed by *wb_url* and build the replay response.

        :param wb_url: string form of the replay url; ``'#'`` is escaped to
            ``'%23'`` before it is parsed into a ``WbUrl``
        :param kwargs: routing parameters; keys read directly here are
            ``'output'``, ``'no_timegate_check'``, ``'coll'``, ``'metadata'``
            and ``'cache'``; the dict is also handed to several helpers
        :param environ: WSGI environ; it is modified in place
            (``HTTP_X_WOMBAT_HISTORY_PAGE`` is popped and, in proxy mode,
            ``pywb_proxy_magic`` is set)
        :returns: a ``WbResponse``, or whatever ``format_response`` /
            ``send_redirect`` return on the early-exit paths
        :raises NotFoundException: upstream request returned status 404
        :raises UpstreamException: upstream request returned any other
            status >= 400
        """
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)

        # a history-page request replaces the target url and is always
        # treated as an ajax request
        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
        if history_page:
            wb_url.url = history_page
            is_ajax = True
        else:
            is_ajax = self.is_ajax(environ)

        is_timegate = self._check_accept_dt(wb_url, environ)

        self.prepare_env(environ)

        host_prefix = environ['pywb.host_prefix']
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        pywb_static_prefix = environ['pywb.static_prefix'] + '/'
        is_proxy = ('wsgiprox.proxy_host' in environ)

        # if OPTIONS in proxy mode, just generate the proxy response
        if is_proxy and self.is_preflight(environ):
            return WbResponse.options_response(environ)

        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        # no redirects if in proxy
        redirect_to_exact = self.redirect_to_exact and not is_proxy

        # Check Prefer
        pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ,
                                                      content_rw, is_proxy)

        response = None
        keep_frame_response = False

        # prefer overrides custom response?
        if pref_mod is not None:
            # fast-redirect to preferred
            if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
                new_url = full_prefix + wb_url.to_str(mod=pref_mod)
                headers = [('Preference-Applied', pref_applied),
                           ('Vary', 'Prefer')]

                return WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect',
                                                 headers=headers)
            else:
                wb_url.mod = pref_mod
        else:
            if kwargs.get('output'):
                response = self.handle_timemap(wb_url, kwargs, full_prefix)

            elif wb_url.is_query():
                response = self.handle_query(environ, wb_url, kwargs,
                                             full_prefix)

            else:
                response = self.handle_custom_response(environ, wb_url,
                                                       full_prefix,
                                                       host_prefix, kwargs)

                # hold the custom response back until the cdx has been
                # loaded (see the check after the upstream request below)
                keep_frame_response = (not kwargs.get('no_timegate_check')
                                       and is_timegate
                                       and not is_proxy) or redirect_to_exact

        if response and not keep_frame_response:
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix,
                                      pywb_static_prefix=pywb_static_prefix)

            framed_replay = self.framed_replay

        # a target url with an empty path is redirected to path '/'
        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            return self.send_redirect('/', url_parts, urlrewriter)

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(
            inputreq, wb_url)

        setcookie_headers = None
        cookie_key = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            if cookie_key:
                res = self.cookie_tracker.get_cookie_headers(
                    wb_url.url, urlrewriter, cookie_key,
                    environ.get('HTTP_COOKIE', ''))
                inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        if r.status_code >= 400:
            # best-effort read of the upstream error body; a failed read is
            # ignored and the raw stream is closed either way
            error = None
            try:
                error = r.raw.read()
            except Exception:
                pass
            finally:
                no_except_close(r.raw)

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            if r.status_code == 404:
                raise NotFoundException(url=wb_url.url, msg=details)

            else:
                raise UpstreamException(r.status_code,
                                        url=wb_url.url,
                                        details=details)

        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        cdx_url_parts = urlsplit(cdx['url'])

        if cdx_url_parts.path.endswith(
                '/') and not url_parts.path.endswith('/'):
            # add trailing slash
            new_path = url_parts.path + '/'

            no_except_close(r.raw)

            return self.send_redirect(new_path, url_parts, urlrewriter)

        # only redirect to exact if not live, otherwise set to false
        redirect_to_exact = redirect_to_exact and not cdx.get('is_live')

        # return top-frame timegate response, with timestamp from cdx
        if response and keep_frame_response and (not redirect_to_exact
                                                 or not is_timegate):
            no_except_close(r.raw)
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy,
                                        cdx['timestamp'])

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        # cdx['urlkey'] = urlkey
        # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        # cdx['url'] = target_uri

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redirect to exact timestamp (only set if not live)
        if redirect_to_exact:
            if set_content_loc or is_timegate or wb_url.timestamp != cdx.get(
                    'timestamp'):
                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect')
                if self.enable_memento:
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri,
                                                full_prefix,
                                                memento_dt,
                                                cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate,
                                                is_proxy,
                                                pref_applied=pref_applied,
                                                mod=pref_mod,
                                                is_memento=False)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(
                            target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs, record)

        # a truthy result from _add_range switches the url modifier to 'id_'
        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        # ajax requests get no head insert
        if is_ajax:
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.create_insert_func(
                wb_url,
                full_prefix,
                host_prefix,
                top_url,
                environ,
                framed_replay,
                coll=kwargs.get('coll', ''),
                replay_mod=self.replay_mod,
                metadata=kwargs.get('metadata', {}),
                config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker and cookie_key:
            # skip add cookie if service worker is not 200
            # it seems cookie headers from service workers are not applied, so don't update in cache
            if wb_url.mod == 'sw_':
                cookie_key = None

            cookie_rewriter = self.cookie_tracker.get_rewriter(
                urlrewriter, cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter,
                            head_insert_func, cdx, environ)

        status_headers, gen, is_rw = result

        # history-page requests are answered with a JSON title only; title
        # fallbacks: extracted title, then the X-Wombat-History-Title
        # header, then the history page url itself
        if history_page:
            title = DefaultRewriter._extract_title(gen)
            if not title:
                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

            if not title:
                title = history_page

            self._add_history_page(cdx, kwargs, title)
            return WbResponse.json_response({'title': title})

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # a status line without a space gets ' None' appended as its
        # second part
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'],
                                    full_prefix,
                                    memento_dt,
                                    cdx['timestamp'],
                                    status_headers,
                                    is_timegate,
                                    is_proxy,
                                    cdx.get('source-coll'),
                                    mod=pref_mod,
                                    pref_applied=pref_applied)

            set_content_loc = True

        if set_content_loc and not redirect_to_exact and not is_proxy:
            status_headers.headers.append(
                ('Content-Location',
                 urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                         url=cdx['url'])))

        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        if is_proxy and environ.get('HTTP_ORIGIN'):
            response.add_access_control_headers(environ)

        # long-lived public caching only for 200 responses when the
        # 'cache' kwarg is 'always' and a Referer header is present
        if r.status_code == 200 and kwargs.get(
                'cache') == 'always' and environ.get('HTTP_REFERER'):
            response.status_headers[
                'Cache-Control'] = 'public, max-age=31536000, immutable'

        return response
Пример #29
0
 def to_str(self, **overrides):
     """Serialize via ``WbUrl.to_str`` with ``mod`` and ``timestamp`` forced to ''.

     Any ``mod`` or ``timestamp`` supplied by the caller is replaced by the
     empty string; all other keyword overrides are forwarded unchanged.
     """
     forced = dict(overrides, mod='', timestamp='')
     return WbUrl.to_str(self, **forced)
Пример #30
0
    def handle_magic_page(self, env):
        """Dispatch a request made to one of the proxy "magic" host names.

        The branch taken depends on ``env['pywb.proxy_host']``; the path and
        query of ``env['REL_REQUEST_URI']`` (without the leading ``/``) are
        carried along as ``path_url``.

        :param env: WSGI environ; must contain ``'REL_REQUEST_URI'`` and
            ``'pywb.proxy_host'`` (``'pywb.proxy_scheme'`` is also read in
            the set-host branch)
        :returns: the response produced by the matching branch; ``None``
            when the ``query.`` host receives a query-type ``WbUrl`` and,
            in the lines visible here, when no branch matches
        """
        request_url = env['REL_REQUEST_URI']
        parts = urlsplit(request_url)
        server_name = env['pywb.proxy_host']

        # request path without the leading '/', plus the query string if any
        path_url = parts.path[1:]
        if parts.query:
            path_url += '?' + parts.query

        # 'auto...' host: set-host cookie response when a collection is
        # already known, otherwise the 'select' magic response
        if server_name.startswith('auto'):
            coll, ts, sesh_id = self.get_coll(env)

            if coll:
                return self.make_sethost_cookie_response(sesh_id,
                                                         path_url,
                                                         env)
            else:
                return self.make_magic_response('select', path_url, env)

        # 'query.' host: store the timestamp from the url for this session
        # and redirect to the plain url
        elif server_name.startswith('query.'):
            wb_url = WbUrl(path_url)

            # only dealing with specific timestamp setting
            if wb_url.is_query():
                return None

            coll, ts, sesh_id = self.get_coll(env)
            if not coll:
                return self.make_magic_response('select', path_url, env)

            self.set_ts(sesh_id, wb_url.timestamp)
            return self.make_redir_response(wb_url.url)

        # '<coll><set_prefix>' host: bind the collection to the (possibly
        # renewed) session id
        elif server_name.endswith(self.set_prefix):
            old_sesh_id = extract_client_cookie(env, self.cookie_name)
            sesh_id = self.create_renew_sesh_id(old_sesh_id)

            # cookie headers are only built when the session id changed
            if sesh_id != old_sesh_id:
                headers = self.make_cookie_headers(sesh_id, self.magic_name)
            else:
                headers = None

            # collection name is the host name with set_prefix stripped
            coll = server_name[:-len(self.set_prefix)]

            # set sesh value
            self.set_coll(sesh_id, coll)

            return self.make_sethost_cookie_response(sesh_id, path_url, env,
                                                     headers=headers)

        # '<sesh_id><sethost_prefix><domain>' host: build cookie headers
        # for that domain and redirect to the original url on it
        elif self.sethost_prefix in server_name:
            inx = server_name.find(self.sethost_prefix)
            sesh_id = server_name[:inx]

            domain = server_name[inx + len(self.sethost_prefix):]

            headers = self.make_cookie_headers(sesh_id, domain)

            full_url = env['pywb.proxy_scheme'] + '://' + domain
            full_url += '/' + path_url
            return self.make_redir_response(full_url, headers=headers)

        # 'select.' host: render the collection selection view
        elif 'select.' in server_name:
            coll, ts, sesh_id = self.get_coll(env)

            route_temp = '-set.' + self.magic_name + '/' + path_url

            return (self.proxy_select_view.
                    render_response(routes=self.routes,
                                    route_temp=route_temp,
                                    coll=coll,
                                    url=path_url))