예제 #1
0
    def load(self, url, offset=0, length=-1):
        """
        Load a file-like reader from the local file system.

        :param url: path or url to open; it is first passed through
            ``from_file_url`` and, if that changes it, treated as a plain
            file path
        :param offset: if greater than 0, the file is seeked to this
            position before being returned
        :param length: if non-negative, the file is wrapped in a
            ``LimitReader`` with this length; otherwise the raw file
            object is returned
        :return: the opened binary file, or the ``LimitReader`` wrapping it,
            or whatever the parent loader returns when the url can't be
            opened as a local file
        :raises IOError: if the url can only be a file path and opening
            it fails
        """

        # if starting with . or /, can only be a file path..
        file_only = url.startswith(('/', '.'))

        # convert to filename
        filename = from_file_url(url)
        if filename != url:
            file_only = True
            url = filename

        afile = None
        try:
            # first, try as file
            afile = open(url, 'rb')

        except IOError:
            no_except_close(afile)
            if file_only:
                raise

            # not necessarily a local file: defer to the parent loader
            return super(LocalFileLoader, self).load(url, offset, length)

        try:
            if offset > 0:
                afile.seek(offset)

            if length >= 0:
                return LimitReader(afile, length)
        except Exception:
            # the file is already open here, so release the handle
            # instead of leaking it when seeking or wrapping fails
            no_except_close(afile)
            raise

        return afile
예제 #2
0
    def _do_request(self, method, load_url, data, req_headers, params,
                    is_live):
        """
        Issue a single upstream request and return the response object.

        The adapter is ``DefaultAdapters.live_adapter`` when ``is_live`` is
        set and ``DefaultAdapters.remote_adapter`` otherwise. When
        ``SOCKS_PROXIES`` is set the connection comes from
        ``adapter.get_connection``, otherwise the adapter's pool manager is
        used. Redirects are not followed and the body is requested without
        preloading or content decoding.

        :raises LiveResourceException: if issuing the request raises any
            ``Exception``
        """
        if is_live:
            adapter = DefaultAdapters.live_adapter
        else:
            adapter = DefaultAdapters.remote_adapter

        retries = adapter.max_retries

        conn = (adapter.get_connection(load_url, SOCKS_PROXIES)
                if SOCKS_PROXIES else adapter.poolmanager)

        try:
            # if urlopen raises there is no response object to clean up,
            # so the error path only needs to log and re-raise
            return conn.urlopen(method=method,
                                url=load_url,
                                body=data,
                                headers=req_headers,
                                redirect=False,
                                assert_same_host=False,
                                preload_content=False,
                                decode_content=False,
                                retries=retries,
                                timeout=params.get('_timeout'))

        except Exception as exc:
            if logger.isEnabledFor(logging.DEBUG):
                import traceback
                traceback.print_exc()
                failure = ('FAILED: ' + method + ' ' + load_url + ': ' +
                           str(exc))
                logger.debug(failure)

            raise LiveResourceException(load_url)
예제 #3
0
    def handle_timegate(self, params, timestamp):
        """
        Query the timegate for ``params['url']`` with a HEAD request and
        return an iterator over a single ``CDXObject``.

        If the response is truthy and carries a ``Memento-Datetime`` header,
        the url, timestamp and load url are replaced by what
        ``self._extract_location`` derives from the ``Location`` header
        (status >= 300) or the ``Content-Location`` header (otherwise).

        :raises NotFoundException: if building the headers or the HEAD
            request raises, or if the status check below sees a status
            >= 400
        """
        url = params['url']
        load_url = self.timegate_url.format(url=url, timestamp=timestamp)

        res = None
        try:
            headers = self._get_headers(params)
            res = self.sesh.head(load_url, headers=headers)
        except Exception:
            no_except_close(res)
            raise NotFoundException(url)

        # NOTE(review): this relies on the truthiness of ``res``. If
        # ``self.sesh`` is a ``requests`` session, a response is falsy for
        # status >= 400, which would make the check just below
        # unreachable -- confirm the intended behavior.
        if res and res.headers.get('Memento-Datetime'):
            if res.status_code >= 400:
                no_except_close(res)
                raise NotFoundException(url)

            location_header = ('Location' if res.status_code >= 300
                               else 'Content-Location')

            url, timestamp, load_url = self._extract_location(
                url, res.headers.get(location_header))

        cdx = CDXObject()
        cdx['urlkey'] = canonicalize(url)
        cdx['timestamp'] = timestamp
        cdx['url'] = url
        cdx['load_url'] = load_url

        # pass the referrer used for the timegate request along in the cdx
        if 'Referer' in headers:
            cdx['set_referrer'] = headers['Referer']

        return iter([cdx])
예제 #4
0
def load_yaml_config(config_file):
    """
    Load ``config_file`` via ``load`` and parse it as YAML.

    The loaded stream is always passed to ``no_except_close``, whether or
    not parsing succeeds.

    :return: the parsed configuration object
    """
    stream = None
    try:
        stream = load(config_file)
        # NOTE(review): ``yaml.Loader`` can construct arbitrary Python
        # objects, which is only safe for trusted config files --
        # consider ``yaml.SafeLoader`` if configs can come from elsewhere.
        return yaml.load(stream, Loader=yaml.Loader)
    finally:
        no_except_close(stream)
 def _close_file(self, fh):
     """
     Unlock (except on Windows, where ``os.name`` is ``'nt'``) and close
     the file handle ``fh``.

     :return: True if no exception was raised while unlocking, False
         otherwise (the exception is printed); ``fh`` is passed to
         ``no_except_close`` in either case
     """
     unlocked = True
     try:
         if os.name != 'nt':
             portalocker.lock(fh, portalocker.LOCK_UN)
     except Exception as err:
         print(err)
         unlocked = False
     finally:
         no_except_close(fh)

     return unlocked
예제 #6
0
    def rewrite_text_stream_to_gen(self, stream, rwinfo):
        """
        Generate the rewritten contents of ``stream`` as encoded chunks.

        Yields ``self.first_buff`` first (if set), then each portion read
        from the stream after decoding it and passing it through
        ``self.rewrite``, and finally the result of ``self.final_read()``
        (if any). When ``self.align_to_line`` is set, every portion is
        extended to the end of the current line. The stream is passed to
        ``no_except_close`` when the generator finishes or is closed.
        """
        try:
            chunk = self.first_buff

            # for html rewriting:
            # keep utf-8 if that is the declared charset, otherwise fall
            # back to an ascii-compatible single-byte encoding.
            # the text form is only needed for url rewriting; every chunk
            # is encoded back to bytes after rewriting
            is_utf8_html = (rwinfo.charset == 'utf-8'
                            and rwinfo.text_type == 'html')
            charset = 'utf-8' if is_utf8_html else 'iso-8859-1'

            if chunk:
                yield chunk.encode(charset)

            decoder = codecs.getincrementaldecoder(charset)()

            while True:
                chunk = stream.read(BUFF_SIZE)
                if not chunk:
                    break

                if self.align_to_line:
                    chunk += stream.readline()

                try:
                    chunk = decoder.decode(chunk)
                except UnicodeDecodeError:
                    # content is not valid utf-8 after all: switch to
                    # iso-8859-1 for the rest of the stream and record
                    # the change on rwinfo
                    if charset == 'utf-8':
                        rwinfo.charset = 'iso-8859-1'
                        charset = rwinfo.charset
                        decoder = codecs.getincrementaldecoder(charset)()
                        chunk = decoder.decode(chunk)

                chunk = self.rewrite(chunk)

                yield chunk.encode(charset)

            # For adding a tail/handling final buffer
            chunk = self.final_read()

            # ensure decoder is marked as finished (final buffer already decoded)
            decoder.decode(b'', final=True)

            if chunk:
                yield chunk.encode(charset)

        finally:
            no_except_close(stream)
예제 #7
0
    def __call__(self, env, start_response):
        """Send this response through the WSGI interface.

        ``start_response`` is always called with the status line and
        headers. For HEAD and OPTIONS requests, and for a status line
        starting with ``304``, the body is closed and an empty list is
        returned instead of the body.

        :param dict env: The WSGI environment dictionary
        :param function start_response: The WSGI start_response function
        :return: The response body
        """
        start_response(self.status_headers.statusline,
                       self.status_headers.headers)

        method = env['REQUEST_METHOD']
        skip_body = (method in ('HEAD', 'OPTIONS')
                     or self.status_headers.statusline.startswith('304'))

        if not skip_body:
            return self.body

        no_except_close(self.body)
        return []
예제 #8
0
    def _put_record(self, request_uri, input_buff, record_type, headers,
                    params, start_response):
        """
        Write the request body as a record and send a JSON status message.

        For ``record_type == 'stream'`` the body is handed directly to
        ``self.writer.write_stream_to_file``. For any other type the body
        is read through a ``ReqWrapper``, turned into a WARC record of that
        type for ``params['url']`` and written with
        ``self.writer.write_record``. The wrapper's ``out`` buffer is
        always closed afterwards.

        :return: the result of ``self.send_message`` with a ``200 OK``
            status
        """
        if record_type == 'stream':
            written = self.writer.write_stream_to_file(params, input_buff)
            if written:
                msg = {'success': 'true'}
            else:
                msg = {'error_message': 'upload_error'}

            return self.send_message(msg, '200 OK', start_response)

        req_stream = None
        try:
            req_stream = ReqWrapper(input_buff, headers, params,
                                    self.create_buff_func)

            # read the wrapper until it is exhausted; the payload is then
            # taken from ``req_stream.out`` below
            while req_stream.read():
                pass

            content_type = headers.get('Content-Type')

            payload_length = req_stream.out.tell()
            req_stream.out.seek(0)

            record = self.writer.create_warc_record(
                uri=params['url'],
                record_type=record_type,
                payload=req_stream.out,
                length=payload_length,
                warc_content_type=content_type,
                warc_headers_dict=req_stream.headers)

            self.writer.write_record(record, params)

            warc_date = record.rec_headers.get_header('WARC-Date')
            msg = {'success': 'true', 'WARC-Date': warc_date}

        finally:
            if req_stream:
                no_except_close(req_stream.out)

        return self.send_message(msg, '200 OK', start_response)
예제 #9
0
    def handle_timemap(self, params):
        """
        Fetch the timemap for the given params and convert its links.

        The url is built from ``self.timemap_url`` via ``res_template``;
        the response text is passed to ``self.links_to_cdxobject`` with the
        ``'timemap'`` type.

        :raises NotFoundException: if the request raises or
            ``raise_for_status()`` reports an error status
        """
        url = res_template(self.timemap_url, params)
        headers = self._get_headers(params)

        res = None
        try:
            res = self.sesh.get(url,
                                headers=headers,
                                timeout=params.get('_timeout'))
            res.raise_for_status()
        except Exception as exc:
            no_except_close(res)
            self.logger.debug('FAILED: ' + str(exc))
            raise NotFoundException(url)

        return self.links_to_cdxobject(res.text, 'timemap')
예제 #10
0
    def __call__(self, cdx, failed_files, cdx_loader, *args, **kwargs):
        """
        Load the headers and payload records for ``cdx`` and return the
        http headers together with the payload stream.

        :return: tuple ``(http_headers, raw_stream)`` where the headers
            come from the headers record and the stream from the payload
            record
        :raises ArchiveLoadFailed: if either record is missing after the
            fallback handling below
        """
        headers_record, payload_record = self.load_headers_and_payload(
            cdx, failed_files, cdx_loader)

        # Default handling logic when loading http status/headers

        if not headers_record:
            # special case: old-style revisit with missing header,
            # so take the headers from the payload record
            headers_record = payload_record
        elif headers_record != payload_record:
            # this record is only used for its (already parsed) headers,
            # so the remainder of its stream can be closed
            no_except_close(headers_record.raw_stream)

            # special case: headers record is actually empty
            # (eg empty revisit), then use headers from the payload record
            if not headers_record.http_headers.headers:
                headers_record = payload_record

        if not (headers_record and payload_record):
            # release whichever record was loaded before failing
            for record in (headers_record, payload_record):
                if record:
                    no_except_close(record.raw_stream)

            raise ArchiveLoadFailed('Could not load ' + str(cdx))

        # ensure status line is valid from here
        http_headers = headers_record.http_headers
        http_headers.validate_statusline('204 No Content')

        return (headers_record.http_headers, payload_record.raw_stream)
예제 #11
0
    def _write_one(self):
        """
        Take one entry from ``self.write_queue`` and write it out.

        The response payload is parsed as a record. If it is a
        ``response`` record, a matching ``request`` record is created from
        the request payload and both are written as a pair; otherwise only
        the parsed record is written. Both payload buffers are closed
        afterwards, whether or not writing succeeded.
        """
        req_pay = None
        resp_pay = None
        try:
            (req_head, req_pay, resp_head,
             resp_pay, params) = self.write_queue.get()

            # position is queried before rewinding (the value is not used)
            resp_pay.tell()
            resp_pay.seek(0)
            resp = ArcWarcRecordLoader().parse_record_stream(resp_pay)

            if resp.rec_type != 'response':
                self.writer.write_record(resp, params)
                return

            uri = resp.rec_headers.get_header('WARC-Target-Uri')
            req_length = req_pay.tell()
            req_pay.seek(0)
            req = self.writer.create_warc_record(
                uri=uri,
                record_type='request',
                payload=req_pay,
                length=req_length,
                warc_headers_dict=req_head)

            self.writer.write_request_response_pair(req, resp, params)

        finally:
            try:
                for payload in (req_pay, resp_pay):
                    if payload:
                        no_except_close(payload)
            except Exception:
                traceback.print_exc()
예제 #12
0
    def _do_request(self, method, load_url, data, req_headers, params,
                    is_live):
        """
        Issue a single upstream request and return the response object.

        The adapter is ``DefaultAdapters.live_adapter`` when ``is_live`` is
        set and ``DefaultAdapters.remote_adapter`` otherwise. If
        ``self.socks_proxy`` is set and the ``SOCKS_DISABLE`` environment
        variable is not, the request goes through the adapter's proxy
        manager for that proxy; otherwise through its pool manager.
        Redirects are not followed and the body is requested without
        preloading or content decoding.

        :raises LiveResourceException: if issuing the request raises any
            ``Exception``
        """
        if is_live:
            adapter = DefaultAdapters.live_adapter
        else:
            adapter = DefaultAdapters.remote_adapter

        retries = adapter.max_retries

        # get either the poolmanager or proxy manager to handle this connection
        use_socks = self.socks_proxy and not os.environ.get('SOCKS_DISABLE')
        if use_socks:
            manager = adapter.proxy_manager_for(self.socks_proxy)
        else:
            manager = adapter.poolmanager

        try:
            # if urlopen raises there is no response object to clean up,
            # so the error path only needs to log and re-raise
            return manager.urlopen(method=method,
                                   url=load_url,
                                   body=data,
                                   headers=req_headers,
                                   redirect=False,
                                   assert_same_host=False,
                                   preload_content=False,
                                   decode_content=False,
                                   retries=retries,
                                   timeout=params.get('_timeout'))

        except Exception as exc:
            if logger.isEnabledFor(logging.DEBUG):
                import traceback
                traceback.print_exc()
                failure = ('FAILED: ' + method + ' ' + load_url + ': ' +
                           str(exc))
                logger.debug(failure)

            raise LiveResourceException(load_url)
예제 #13
0
    def _write_to_file(self):
        """
        Queue the captured request/response buffers for writing.

        Nothing is queued when ``self.interrupted`` is set. If the entry is
        skipped, or queueing raises (the traceback is printed), both output
        buffers are closed here. ``self.req`` is always closed and reset to
        None.
        """
        skipping = False
        try:
            if self.interrupted:
                skipping = True
            else:
                self.queue.put((self.req.headers, self.req.out,
                                self.headers, self.out, self.params))
        except Exception:
            traceback.print_exc()
            skipping = True

        finally:
            # buffers that were not queued have to be released here
            if skipping:
                no_except_close(self.out)
                no_except_close(self.req.out)

            no_except_close(self.req)
            self.req = None
예제 #14
0
    def render_content(self, wb_url, kwargs, environ):
        """
        Handle a replay request for ``wb_url`` and build the response.

        The url is parsed into a ``WbUrl``, the record is requested via
        ``self._do_req`` and parsed, and the result is either a redirect
        (missing path, missing trailing slash, redirect-to-exact), a JSON
        response (history page requests) or a ``WbResponse`` wrapping the
        output of the content rewriter.

        :param wb_url: the raw archival url string
        :param dict kwargs: request arguments; passed on to several helpers
            and included in error details
        :param dict environ: the WSGI environ; several keys are set on it
            and ``HTTP_X_WOMBAT_HISTORY_PAGE`` is popped from it here
        :return: the response object for this request
        :raises UpstreamException: if the upstream response has a status
            code >= 400
        """
        # percent-encode any '#' before parsing into a WbUrl
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)

        # scheme from the forwarded-proto header, else self.force_scheme
        proto = environ.get('HTTP_X_FORWARDED_PROTO', self.force_scheme)

        if proto:
            environ['wsgi.url_scheme'] = proto

        # a history page header overrides the url and forces ajax handling
        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
        if history_page:
            wb_url.url = history_page
            is_ajax = True
        else:
            is_ajax = self.is_ajax(environ)

        is_timegate = self._check_accept_dt(wb_url, environ)

        host_prefix = self.get_host_prefix(environ)
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix
        environ['pywb.host_prefix'] = host_prefix
        pywb_static_prefix = host_prefix + environ.get(
            'pywb.app_prefix', '') + environ.get('pywb.static_prefix',
                                                 '/static/')
        is_proxy = ('wsgiprox.proxy_host' in environ)

        # a custom response, if any, short-circuits the regular replay
        response = self.handle_custom_response(environ, wb_url, full_prefix,
                                               host_prefix, kwargs)

        if response:
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy)

        if is_proxy:
            # proxy mode: identity url rewriter and no framed replay
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix,
                                      pywb_static_prefix=pywb_static_prefix)

            framed_replay = self.framed_replay

        # a url without any path is redirected to '/'
        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            return self.send_redirect('/', url_parts, urlrewriter)

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(
            inputreq, wb_url)

        # with a cookie tracker and a cookie key, fetch the extra cookie
        # for the request and the set-cookie headers for the response
        setcookie_headers = None
        cookie_key = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            if cookie_key:
                res = self.cookie_tracker.get_cookie_headers(
                    wb_url.url, urlrewriter, cookie_key,
                    environ.get('HTTP_COOKIE', ''))
                inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        if r.status_code >= 400:
            # read the error body best-effort, always release the raw
            # stream, then report the failure to the caller
            error = None
            try:
                error = r.raw.read()
            except Exception:
                pass
            finally:
                no_except_close(r.raw)

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            raise UpstreamException(r.status_code,
                                    url=wb_url.url,
                                    details=details)

        # NOTE(review): if the 'Warcserver-Cdx' header is missing, .get()
        # returns None and .encode() raises AttributeError -- confirm the
        # header is always present on non-error responses
        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        cdx_url_parts = urlsplit(cdx['url'])

        if cdx_url_parts.path.endswith(
                '/') and not url_parts.path.endswith('/'):
            # add trailing slash
            new_path = url_parts.path + '/'

            no_except_close(r.raw)

            return self.send_redirect(new_path, url_parts, urlrewriter)

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        # cdx['urlkey'] = urlkey
        # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        # cdx['url'] = target_uri

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redir to exact, redir if url or ts are different
        # NOTE(review): this return path does not close r.raw / stream in
        # this method -- confirm they are released elsewhere
        if self.redirect_to_exact:
            if (set_content_loc or (wb_url.timestamp != cdx.get('timestamp')
                                    and not cdx.get('is_live'))):

                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect')
                if self.enable_memento:
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri, full_prefix,
                                                memento_dt, cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate, is_proxy)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(
                            target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs, record)

        # when a range was added to the record, switch the modifier to 'id_'
        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        if is_ajax:
            # ajax requests get no head insert
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.create_insert_func(
                wb_url,
                full_prefix,
                host_prefix,
                top_url,
                environ,
                framed_replay,
                coll=kwargs.get('coll', ''),
                replay_mod=self.replay_mod,
                config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker and cookie_key:
            # skip add cookie if service worker is not 200
            # it seems cookie headers from service workers are not applied, so don't update in cache
            if wb_url.mod == 'sw_':
                cookie_key = None

            cookie_rewriter = self.cookie_tracker.get_rewriter(
                urlrewriter, cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter,
                            head_insert_func, cdx, environ)

        status_headers, gen, is_rw = result

        if history_page:
            # history page request: only the title is returned, as JSON.
            # the title comes from the content, else from the history
            # title header, else the history page url itself
            title = DefaultRewriter._extract_title(gen)
            if not title:
                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

            if not title:
                title = history_page

            self._add_history_page(cdx, kwargs, title)
            return WbResponse.json_response({'title': title})

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # a status line without any space gets ' None' appended
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'], full_prefix, memento_dt,
                                    cdx['timestamp'], status_headers,
                                    is_timegate, is_proxy,
                                    cdx.get('source-coll'))

            set_content_loc = True

        # without redirect-to-exact, point to the exact capture instead
        if set_content_loc and not self.redirect_to_exact:
            status_headers.headers.append(
                ('Content-Location',
                 urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                         url=cdx['url'])))
        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        return response
예제 #15
0
    def load_resource(self, cdx, params):
        """
        Load the record(s) that ``cdx`` refers to by filename and offset.

        Returns ``cdx['_cached_result']`` if it is set, and None if the cdx
        has no filename or offset. Otherwise the headers and payload
        records are loaded through ``self.resolve_loader``. For
        ``response``/``revisit`` payloads whose cdx status is unset or 3xx,
        the http headers are parsed and checked for a self-redirect. If the
        headers and payload records differ, the payload's WARC headers are
        updated to describe the headers record.

        :return: tuple ``(warc_headers, http_headers_buff, raw_stream)``;
            ``http_headers_buff`` is None unless the http headers were
            parsed here
        :raises LiveResourceException: re-raised from the self-redirect
            check, after both record streams are closed
        """
        cached = cdx.get('_cached_result')
        if cached:
            return cached

        has_location = cdx.get('filename') and cdx.get('offset') is not None
        if not has_location:
            return None

        orig_source = cdx.get('source', '').split(':')[0]
        formatter = ParamFormatter(params, orig_source)
        cdx._formatter = formatter

        def local_index_query(local_params):
            # copy the 'param.*' entries of the outer request params into
            # the local query before running it against the cdx source
            for name, value in six.iteritems(params):
                if name.startswith('param.'):
                    local_params[name] = value

            cdx_iter, errs = self.cdx_source(local_params)
            for found in cdx_iter:
                found._formatter = formatter
                yield found

        failed_files = []
        headers, payload = self.resolve_loader.load_headers_and_payload(
            cdx, failed_files, local_index_query)

        http_headers_buff = None
        if payload.rec_type in ('response', 'revisit'):
            status = cdx.get('status')
            # status may not be set for 'revisit'
            if not status or status.startswith('3'):
                http_headers = self.headers_parser.parse(payload.raw_stream)

                try:
                    self.raise_on_self_redirect(
                        params, cdx, http_headers.get_statuscode(),
                        http_headers.get_header('Location'))
                except LiveResourceException:
                    no_except_close(headers.raw_stream)
                    no_except_close(payload.raw_stream)
                    raise

                http_headers_buff = http_headers.to_bytes()

        warc_headers = payload.rec_headers

        if headers != payload:
            # order matters: the payload's own target uri and date are
            # saved as 'Refers-To' headers before being overwritten with
            # the values from the headers record
            replacements = (
                ('WARC-Refers-To-Target-URI', payload, 'WARC-Target-URI'),
                ('WARC-Refers-To-Date', payload, 'WARC-Date'),
                ('WARC-Target-URI', headers, 'WARC-Target-URI'),
                ('WARC-Date', headers, 'WARC-Date'),
            )
            for target_name, source_record, source_name in replacements:
                warc_headers.replace_header(
                    target_name,
                    source_record.rec_headers.get_header(source_name))

            no_except_close(headers.raw_stream)

        return (warc_headers, http_headers_buff, payload.raw_stream)
예제 #16
0
    def load_resource(self, cdx, params):
        """
        Load the record(s) that ``cdx`` refers to by filename and offset.

        Returns ``cdx['_cached_result']`` if it is set, and None if the cdx
        has no filename or offset. Otherwise the headers and payload
        records are loaded through ``self.resolve_loader``. For
        ``response``/``revisit`` payloads whose cdx status is unset or does
        not start with 2, 4 or 5, the http headers are parsed, checked for
        a self-redirect and re-serialized; if the serialized length differs
        from the stream position after parsing, the record's
        ``Content-Length`` is adjusted by the difference. If the headers
        and payload records differ, the payload's WARC headers are updated
        to describe the headers record.

        :return: tuple ``(warc_headers, http_headers_buff, raw_stream)``;
            ``http_headers_buff`` is None unless the http headers were
            parsed here
        :raises LiveResourceException: re-raised from the self-redirect
            check, after both record streams are closed
        """
        if cdx.get('_cached_result'):
            return cdx.get('_cached_result')

        if not cdx.get('filename') or cdx.get('offset') is None:
            return None

        orig_source = cdx.get('source', '').split(':')[0]
        formatter = ParamFormatter(params, orig_source)
        cdx._formatter = formatter

        def local_index_query(local_params):
            # copy the 'param.*' entries of the outer request params into
            # the local query before running it against the cdx source
            for n, v in six.iteritems(params):
                if n.startswith('param.'):
                    local_params[n] = v

            cdx_iter, errs = self.cdx_source(local_params)
            for cdx in cdx_iter:
                cdx._formatter = formatter
                yield cdx

        failed_files = []
        headers, payload = (self.resolve_loader.load_headers_and_payload(
            cdx, failed_files, local_index_query))

        http_headers_buff = None
        if payload.rec_type in ('response', 'revisit'):
            status = cdx.get('status')

            # if status is not set, or is not 2xx, 4xx or 5xx,
            # go through self-redirect check just in case
            if not status or not status.startswith(('2', '4', '5')):
                http_headers = self.headers_parser.parse(payload.raw_stream)
                try:
                    orig_size = payload.raw_stream.tell()
                except Exception:
                    # position unavailable for this stream: skip the
                    # content-length adjustment below. Only Exception is
                    # caught so that KeyboardInterrupt/SystemExit are not
                    # swallowed here
                    orig_size = 0

                try:
                    self.raise_on_self_redirect(
                        params, cdx, http_headers.get_statuscode(),
                        http_headers.get_header('Location'))
                except LiveResourceException:
                    no_except_close(headers.raw_stream)
                    no_except_close(payload.raw_stream)
                    raise

                http_headers_buff = http_headers.to_bytes()

                # if new http_headers_buff is different length,
                # attempt to adjust content-length on the WARC record
                if orig_size and len(http_headers_buff) != orig_size:
                    orig_cl = payload.rec_headers.get_header('Content-Length')
                    if orig_cl:
                        new_cl = int(orig_cl) + (len(http_headers_buff) -
                                                 orig_size)
                        payload.rec_headers.replace_header(
                            'Content-Length', str(new_cl))

        warc_headers = payload.rec_headers

        if headers != payload:
            # order matters: the payload's own target uri and date are
            # saved as 'Refers-To' headers before being overwritten with
            # the values from the headers record
            warc_headers.replace_header(
                'WARC-Refers-To-Target-URI',
                payload.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header(
                'WARC-Refers-To-Date',
                payload.rec_headers.get_header('WARC-Date'))

            warc_headers.replace_header(
                'WARC-Target-URI',
                headers.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header(
                'WARC-Date', headers.rec_headers.get_header('WARC-Date'))
            no_except_close(headers.raw_stream)

        return (warc_headers, http_headers_buff, payload.raw_stream)
예제 #17
0
    def load_resource(self, cdx, params):
        """
        Load the resource for ``cdx`` by requesting ``cdx['load_url']``.

        Returns None when this loader does not apply: the cdx refers to a
        file (filename and offset set), has no ``load_url``, the requested
        content type is ``VideoLoader.CONTENT_TYPE``, or ``memento_url`` is
        set but the response has no ``Memento-Datetime`` header.

        :param cdx: the cdx entry to load; ``timestamp`` and ``source`` may
            be updated here from the upstream response headers
        :param dict params: request params; ``_input_req`` supplies the
            request headers, method and body
        :return: None (see above), or ``(None, headers, upstream_res)`` when
            the upstream response has ``Warcserver-Type: warc``, or
            ``(warc_headers, http_headers_buff, upstream_res)`` otherwise
        :raises LiveResourceException: if the load url can't be prepared
        """
        # records stored in a file are not handled here
        if cdx.get('filename') and cdx.get('offset') is not None:
            return None

        load_url = cdx.get('load_url')
        if not load_url:
            return None

        if params.get('content_type') == VideoLoader.CONTENT_TYPE:
            return None

        if self.forward_proxy_prefix and not cdx.get('is_live'):
            load_url = self.forward_proxy_prefix + load_url

        input_req = params['_input_req']

        req_headers = input_req.get_req_headers()

        dt = timestamp_to_datetime(cdx['timestamp'])

        if cdx.get('memento_url'):
            req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

        method = input_req.get_req_method()
        data = input_req.get_req_body()

        # a PreparedRequest is used only to prepare the url and to derive
        # an Authorization header from it; it is never sent
        p = PreparedRequest()
        try:
            p.prepare_url(load_url, None)
        except Exception:
            raise LiveResourceException(load_url)
        p.prepare_headers(None)
        p.prepare_auth(None, load_url)

        auth = p.headers.get('Authorization')
        if auth:
            req_headers['Authorization'] = auth

        load_url = p.url

        # host is set to the actual host for live loading
        # ensure it is set to the load_url host
        if not cdx.get('is_live'):
            #req_headers.pop('Host', '')
            req_headers['Host'] = urlsplit(p.url).netloc

            referrer = cdx.get('set_referrer')
            if referrer:
                req_headers['Referer'] = referrer

        upstream_res = self._do_request_with_redir_check(
            method, load_url, data, req_headers, params, cdx)

        # the datetime reported by the upstream replaces the cdx timestamp
        memento_dt = upstream_res.headers.get('Memento-Datetime')
        if memento_dt:
            dt = http_date_to_datetime(memento_dt)
            cdx['timestamp'] = datetime_to_timestamp(dt)
        elif cdx.get('memento_url'):
            # if 'memento_url' set and no Memento-Datetime header present
            # then its an error
            no_except_close(upstream_res)
            return None

        # a 'warc' response is passed through without building new headers
        agg_type = upstream_res.headers.get('Warcserver-Type')
        if agg_type == 'warc':
            cdx['source'] = unquote(
                upstream_res.headers.get('Warcserver-Source-Coll'))
            return None, upstream_res.headers, upstream_res

        if upstream_res.version == 11:
            version = '1.1'
        else:
            version = '1.0'

        # rebuild the http status line and headers of the response
        status = 'HTTP/{version} {status} {reason}\r\n'
        status = status.format(version=version,
                               status=upstream_res.status,
                               reason=upstream_res.reason)

        http_headers_buff = status

        orig_resp = upstream_res._original_response

        # NOTE(review): the bare except below falls back to the PY 2 branch
        # on any error in the PY 3 branch (including one raised by
        # unrewrite_header or by both encode attempts), not only when the
        # PY 3 attribute is missing -- confirm this is intended
        try:  #pragma: no cover
            #PY 3
            resp_headers = orig_resp.headers._headers
            for n, v in resp_headers:
                nl = n.lower()
                if nl in self.SKIP_HEADERS:
                    continue

                if nl in self.UNREWRITE_HEADERS:
                    v = self.unrewrite_header(cdx, v)

                http_headers_buff += n + ': ' + v + '\r\n'

            http_headers_buff += '\r\n'

            try:
                # http headers could be encoded as utf-8 (though non-standard)
                # first try utf-8 encoding
                http_headers_buff = http_headers_buff.encode('utf-8')
            except:
                # then, fall back to latin-1
                http_headers_buff = http_headers_buff.encode('latin-1')

        except:  #pragma: no cover
            #PY 2
            resp_headers = orig_resp.msg.headers

            for line in resp_headers:
                n, v = line.split(':', 1)
                n = n.lower()
                v = v.strip()

                if n in self.SKIP_HEADERS:
                    continue

                new_v = v
                if n in self.UNREWRITE_HEADERS:
                    new_v = self.unrewrite_header(cdx, v)

                # keep the original header line unless the value changed
                if new_v != v:
                    http_headers_buff += n + ': ' + new_v + '\r\n'
                else:
                    http_headers_buff += line

            # if python2, already byte headers, so leave as is
            http_headers_buff += '\r\n'

        # best-effort lookup of the peer address through private
        # attributes of the response; any failure leaves it unset
        try:
            fp = upstream_res._fp.fp
            if hasattr(fp, 'raw'):  #pragma: no cover
                fp = fp.raw
            remote_ip = fp._sock.getpeername()[0]
        except:  #pragma: no cover
            remote_ip = None

        # build the WARC headers of the response record
        warc_headers = {}

        warc_headers['WARC-Type'] = 'response'
        warc_headers['WARC-Record-ID'] = self._make_warc_id()
        warc_headers['WARC-Target-URI'] = cdx['url']
        warc_headers['WARC-Date'] = datetime_to_iso_date(dt)

        # for non-live loads, also record the source url and the current
        # time as the creation date
        if not cdx.get('is_live'):
            now = datetime.datetime.utcnow()
            warc_headers['WARC-Source-URI'] = cdx.get('load_url')
            warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now)

        if remote_ip:
            warc_headers['WARC-IP-Address'] = remote_ip

        ct = upstream_res.headers.get('Content-Type')
        if ct:
            metadata = self.get_custom_metadata(ct, dt)
            if metadata:
                warc_headers['WARC-JSON-Metadata'] = json.dumps(metadata)

        warc_headers['Content-Type'] = 'application/http; msgtype=response'

        # HEAD requests are treated as length 0; -1 is passed when the
        # upstream sent no Content-Length header
        if method == 'HEAD':
            content_len = 0
        else:
            content_len = upstream_res.headers.get('Content-Length', -1)

        self._set_content_len(content_len, warc_headers,
                              len(http_headers_buff))

        warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
        return (warc_headers, http_headers_buff, upstream_res)
예제 #18
0
    def handle_call(self, environ, start_response):
        """
        Forward a WSGI request to ``self.upstream_host`` and stream back
        the upstream response, wrapping the request/response streams for
        recording unless one of ``self.skip_filters`` opts out.

        :param environ: WSGI environ of the incoming request
        :param start_response: WSGI ``start_response`` callable
        :return: the result of ``self._put_record`` for PUT/POST requests
            carrying a ``put_record`` param, the result of
            ``self.send_error`` if the upstream request fails, otherwise a
            ``StreamIter`` over the (possibly wrapped) upstream body
        """
        input_req = DirectWSGIInputRequest(environ)

        params = self._get_params(environ)

        request_uri = input_req.get_full_request_uri()

        input_buff = input_req.get_req_body()

        headers = input_req.get_req_headers()

        method = input_req.get_req_method()

        path = environ['PATH_INFO']

        # write request body as metadata/resource
        put_record = params.get('put_record')
        if put_record and method in ('PUT', 'POST'):
            return self._put_record(request_uri, input_buff, put_record,
                                    headers, params, start_response)

        skipping = any(
            x.skip_request(path, headers) for x in self.skip_filters)

        req_is_wrapped = False

        if not skipping:
            req_stream = ReqWrapper(input_buff, headers, params,
                                    self.create_buff_func)
            req_is_wrapped = True
        else:
            req_stream = input_buff

        # only send a body upstream if the incoming request had one
        data = None
        if input_buff:
            data = req_stream

        # stays None if the request itself fails before a response exists
        res = None
        try:
            res = requests.request(url=self.upstream_host + request_uri,
                                   method=method,
                                   data=data,
                                   headers=headers,
                                   allow_redirects=False,
                                   stream=True)
            res.raise_for_status()
        except Exception as e:
            if req_is_wrapped:
                no_except_close(req_stream.out)

            try:
                return self.send_error(e, start_response)
            finally:
                # with stream=True the upstream connection is held open
                # until the response is closed, so release it here when
                # raise_for_status() (or anything after the request) failed.
                # An error-status Response is falsy, so test against None.
                if res is not None:
                    no_except_close(res.raw)

        if not skipping:
            skipping = any(
                x.skip_response(path, req_stream.headers, res.headers, params)
                for x in self.skip_filters)

        if not skipping:
            resp_stream = RespWrapper(res.raw, res.headers, req_stream, params,
                                      self.write_queue, path,
                                      self.create_buff_func)

        else:
            resp_stream = res.raw
            # response is not being wrapped, so the wrapped request
            # buffer is no longer needed
            if req_is_wrapped:
                no_except_close(req_stream.out)

        resp_iter = StreamIter(resp_stream)

        # ensure TE header from upstream is not included,
        # added automatically by wsgi app
        res.headers.pop('Transfer-Encoding', '')

        start_response('200 OK', list(res.headers.items()))
        return resp_iter
예제 #19
0
    def compute_page_range(self, reader, query):
        """
        Generator over the index lines that make up the page selected by
        ``query``, or over a single page-info entry when
        ``query.page_count`` is set.

        The generator takes ownership of ``reader``: it is closed on every
        exit path, including errors and the consumer closing or abandoning
        the generator early.

        :param reader: seekable index reader passed to ``search``,
            ``iter_range`` and ``read_last_line``
        :param query: query object; reads ``page_size``, ``key``,
            ``end_key``, ``page_count`` and ``page``
        :raises CDXException: if ``page_size`` is not an integer or
            ``page`` is outside the range of available pages
        """
        try:
            pagesize = query.page_size
            if not pagesize:
                pagesize = self.max_blocks
            else:
                try:
                    pagesize = int(pagesize)
                except ValueError:
                    msg = 'Invalid value for pageSize= param: {}'
                    raise CDXException(msg.format(pagesize))

            last_line = None

            # Get End
            end_iter = search(reader, query.end_key, prev_size=1)

            try:
                end_line = six.next(end_iter)
            except StopIteration:
                # nothing at/after end_key: fall back to last index line
                last_line = read_last_line(reader)
                end_line = last_line

            # Get Start
            first_iter = iter_range(reader, query.key, query.end_key,
                                    prev_size=1)

            try:
                first_line = six.next(first_iter)
            except StopIteration:
                if end_line == last_line and query.key >= last_line:
                    first_line = last_line
                else:
                    # no matching lines at all; reader is not needed again
                    no_except_close(reader)
                    if query.page_count:
                        yield self._page_info(0, pagesize, 0)
                    return

            first = IDXObject(first_line)

            end = IDXObject(end_line)

            try:
                blocks = end['lineno'] - first['lineno']
                total_pages = int(blocks / pagesize) + 1
            except Exception:
                # line numbers unavailable (or unusable page size):
                # treat everything as a single page of unknown size
                blocks = -1
                total_pages = 1

            if query.page_count:
                # same line, so actually need to look at cdx
                # to determine if it exists
                if blocks == 0:
                    try:
                        block_cdx_iter = self.idx_to_cdx([first_line], query)
                        block = six.next(block_cdx_iter)
                        six.next(block)
                    except StopIteration:
                        total_pages = 0
                        blocks = -1

                # close before yielding: a consumer that only takes the
                # first item never resumes the generator past the yield
                no_except_close(reader)
                yield self._page_info(total_pages, pagesize, blocks + 1)
                return

            curr_page = query.page
            if curr_page >= total_pages or curr_page < 0:
                msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
                raise CDXException(msg.format(curr_page, total_pages - 1))

            startline = curr_page * pagesize
            endline = startline + pagesize - 1
            if blocks >= 0:
                endline = min(endline, blocks)

            if curr_page == 0:
                # first_line was already consumed from first_iter above
                yield first_line
            else:
                # account for the line already consumed from first_iter
                startline -= 1

            try:
                for idx in itertools.islice(first_iter, startline, endline):
                    yield idx
            except Exception:
                # best-effort: a read failure simply ends the page early
                pass
        finally:
            # closing twice is harmless; this covers every exit path
            no_except_close(reader)
예제 #20
0
    def render_content(self, wb_url, kwargs, environ):
        """
        Handle a replay request for ``wb_url`` and build the response.

        The url string is parsed into a ``WbUrl``. Depending on the request,
        this returns early with a proxy preflight (OPTIONS) response, a
        ``Prefer``-driven 307 redirect, a timemap / query / custom response,
        or a redirect that fixes up the url path. Otherwise the record is
        requested through ``self._do_req``, parsed, passed through the
        selected content rewriter and wrapped in a ``WbResponse``.

        :param wb_url: requested url as a string ('#' is escaped to '%23'
            before parsing)
        :param kwargs: per-request options; keys read here include
            'output', 'no_timegate_check', 'coll', 'metadata' and 'cache'
        :param environ: request environ mapping; the
            'HTTP_X_WOMBAT_HISTORY_PAGE' key is removed from it and
            'pywb_proxy_magic' is set on it in proxy mode
        :return: a response object (a ``WbResponse`` or the result of one
            of the response helpers called below)
        :raises NotFoundException: the upstream request returned 404
        :raises UpstreamException: the upstream request returned any other
            status >= 400
        """
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)

        # a history-page request overrides the target url and is
        # always treated as an ajax request
        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
        if history_page:
            wb_url.url = history_page
            is_ajax = True
        else:
            is_ajax = self.is_ajax(environ)

        is_timegate = self._check_accept_dt(wb_url, environ)

        self.prepare_env(environ)

        host_prefix = environ['pywb.host_prefix']
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        pywb_static_prefix = environ['pywb.static_prefix'] + '/'
        is_proxy = ('wsgiprox.proxy_host' in environ)

        # if OPTIONS in proxy mode, just generate the proxy response
        if is_proxy and self.is_preflight(environ):
            return WbResponse.options_response(environ)

        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        # no redirects if in proxy
        redirect_to_exact = self.redirect_to_exact and not is_proxy

        # Check Prefer
        pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ,
                                                      content_rw, is_proxy)

        response = None
        keep_frame_response = False

        # prefer overrides custom response?
        if pref_mod is not None:
            # fast-redirect to preferred
            if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
                new_url = full_prefix + wb_url.to_str(mod=pref_mod)
                headers = [('Preference-Applied', pref_applied),
                           ('Vary', 'Prefer')]

                return WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect',
                                                 headers=headers)
            else:
                wb_url.mod = pref_mod
        else:
            if kwargs.get('output'):
                response = self.handle_timemap(wb_url, kwargs, full_prefix)

            elif wb_url.is_query():
                response = self.handle_query(environ, wb_url, kwargs,
                                             full_prefix)

            else:
                response = self.handle_custom_response(environ, wb_url,
                                                       full_prefix,
                                                       host_prefix, kwargs)

                # when set, a custom response is not returned immediately;
                # it is held until the cdx for the record is available
                keep_frame_response = (not kwargs.get('no_timegate_check')
                                       and is_timegate
                                       and not is_proxy) or redirect_to_exact

        if response and not keep_frame_response:
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix,
                                      pywb_static_prefix=pywb_static_prefix)

            framed_replay = self.framed_replay

        # url without a path: redirect to the same url with path '/'
        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            return self.send_redirect('/', url_parts, urlrewriter)

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(
            inputreq, wb_url)

        # look up tracked cookies for this request, if a tracker is set
        setcookie_headers = None
        cookie_key = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            if cookie_key:
                res = self.cookie_tracker.get_cookie_headers(
                    wb_url.url, urlrewriter, cookie_key,
                    environ.get('HTTP_COOKIE', ''))
                inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        if r.status_code >= 400:
            # read the error body (best-effort) and always close the stream
            error = None
            try:
                error = r.raw.read()
            except Exception:
                pass
            finally:
                no_except_close(r.raw)

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            if r.status_code == 404:
                raise NotFoundException(url=wb_url.url, msg=details)

            else:
                raise UpstreamException(r.status_code,
                                        url=wb_url.url,
                                        details=details)

        # cdx for the returned record is passed in a response header
        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        cdx_url_parts = urlsplit(cdx['url'])

        if cdx_url_parts.path.endswith(
                '/') and not url_parts.path.endswith('/'):
            # add trailing slash
            new_path = url_parts.path + '/'

            no_except_close(r.raw)

            return self.send_redirect(new_path, url_parts, urlrewriter)

        # only redirect to exact if not live, otherwise set to false
        redirect_to_exact = redirect_to_exact and not cdx.get('is_live')

        # return top-frame timegate response, with timestamp from cdx
        if response and keep_frame_response and (not redirect_to_exact
                                                 or not is_timegate):
            no_except_close(r.raw)
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy,
                                        cdx['timestamp'])

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        # cdx['urlkey'] = urlkey
        # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        # cdx['url'] = target_uri

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redirect to exact timestamp (only set if not live)
        if redirect_to_exact:
            if set_content_loc or is_timegate or wb_url.timestamp != cdx.get(
                    'timestamp'):
                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect')
                if self.enable_memento:
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri,
                                                full_prefix,
                                                memento_dt,
                                                cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate,
                                                is_proxy,
                                                pref_applied=pref_applied,
                                                mod=pref_mod,
                                                is_memento=False)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(
                            target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs, record)

        # if a range was applied to the record, switch the modifier to 'id_'
        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        if is_ajax:
            # no head insert for ajax requests
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.create_insert_func(
                wb_url,
                full_prefix,
                host_prefix,
                top_url,
                environ,
                framed_replay,
                coll=kwargs.get('coll', ''),
                replay_mod=self.replay_mod,
                metadata=kwargs.get('metadata', {}),
                config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker and cookie_key:
            # skip add cookie if service worker is not 200
            # it seems cookie headers from service workers are not applied, so don't update in cache
            if wb_url.mod == 'sw_':
                cookie_key = None

            cookie_rewriter = self.cookie_tracker.get_rewriter(
                urlrewriter, cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter,
                            head_insert_func, cdx, environ)

        status_headers, gen, is_rw = result

        # history-page request: record the page title and reply with JSON
        # instead of the rewritten content
        if history_page:
            title = DefaultRewriter._extract_title(gen)
            if not title:
                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

            if not title:
                title = history_page

            self._add_history_page(cdx, kwargs, title)
            return WbResponse.json_response({'title': title})

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # statusline contains no space (no second part after the code):
        # append ' None' as a placeholder
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'],
                                    full_prefix,
                                    memento_dt,
                                    cdx['timestamp'],
                                    status_headers,
                                    is_timegate,
                                    is_proxy,
                                    cdx.get('source-coll'),
                                    mod=pref_mod,
                                    pref_applied=pref_applied)

            set_content_loc = True

        if set_content_loc and not redirect_to_exact and not is_proxy:
            status_headers.headers.append(
                ('Content-Location',
                 urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                         url=cdx['url'])))

        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        if is_proxy and environ.get('HTTP_ORIGIN'):
            response.add_access_control_headers(environ)

        # long-lived caching only for successful responses, when the
        # 'cache' option is 'always' and the request carried a referrer
        if r.status_code == 200 and kwargs.get(
                'cache') == 'always' and environ.get('HTTP_REFERER'):
            response.status_headers[
                'Cache-Control'] = 'public, max-age=31536000, immutable'

        return response
예제 #21
0
 def iter_blocks(reader):
     """
     Lazily yield the result of ``decompress_block`` for each entry of the
     enclosing scope's ``ranges``, closing ``reader`` when iteration ends
     for any reason.
     """
     try:
         for block_range in ranges:
             block_lines = decompress_block(block_range)
             yield block_lines
     finally:
         # runs on exhaustion, on error, and when the generator is closed
         no_except_close(reader)