Example #1
import requests
from io import BytesIO
# LimitReader import assumed; its module path varies by version
# (e.g. pywb.utils.loaders or warcio.limitreader)


def test_limit_post():
    reader = LimitReader(BytesIO(b'abcdefg'), 3)
    r = requests.request(method='POST',
                         url='http://httpbin.org/post',
                         data=reader,
                         headers={'Content-Length': '3'})

    assert '"abc"' in r.text
Example #2
    def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        (status_headers, stream) = (self.content_loader.
                                    resolve_headers_and_payload(cdx,
                                                                failed_files,
                                                                cdx_loader))

        # check and reject self-redirect
        self._reject_self_redirect(wbrequest, cdx, status_headers)

        # check if redir is needed
        redir_response = self._redirect_if_needed(wbrequest, cdx)
        if redir_response:
            return redir_response

        length = status_headers.get_header('content-length')
        stream = LimitReader.wrap_stream(stream, length)

        # one more check for referrer-based self-redirect
        # TODO: evaluate this, as refreshing in browser may sometimes cause
        # referrer to be set to the same page, incorrectly skipping a capture
        # self._reject_referrer_self_redirect(wbrequest)

        urlrewriter = wbrequest.urlrewriter

        # if using url rewriter, use original url for rewriting purposes
        if wbrequest and wbrequest.wb_url:
            wbrequest.wb_url.url = cdx['original']

        head_insert_func = None
        if self.head_insert_view:
            head_insert_func = (self.head_insert_view.
                                create_insert_func(wbrequest))

        result = (self.content_rewriter.
                  rewrite_content(urlrewriter,
                                  headers=status_headers,
                                  stream=stream,
                                  head_insert_func=head_insert_func,
                                  urlkey=cdx['urlkey'],
                                  cdx=cdx))

        (status_headers, response_iter, is_rewritten) = result

        # buffer response if buffering enabled
        if self.buffer_response:
            response_iter = self.buffered_response(status_headers,
                                                   response_iter)

        response = self.response_class(status_headers,
                                       response_iter,
                                       wbrequest=wbrequest,
                                       cdx=cdx)

        # notify reporter callback, if any
        if self._reporter:
            self._reporter(wbrequest, cdx, response)

        return response
Example #3
    def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        (status_headers,
         stream) = (self.content_loader.resolve_headers_and_payload(
             cdx, failed_files, cdx_loader))

        # check and reject self-redirect
        self._reject_self_redirect(wbrequest, cdx, status_headers)

        # check if redir is needed
        redir_response = self._redirect_if_needed(wbrequest, cdx)
        if redir_response:
            return redir_response

        length = status_headers.get_header('content-length')
        stream = LimitReader.wrap_stream(stream, length)

        # one more check for referrer-based self-redirect
        # TODO: evaluate this, as refreshing in browser may sometimes cause
        # referrer to be set to the same page, incorrectly skipping a capture
        # self._reject_referrer_self_redirect(wbrequest)

        urlrewriter = wbrequest.urlrewriter

        # if using url rewriter, use original url for rewriting purposes
        if wbrequest and wbrequest.wb_url:
            wbrequest.wb_url.url = cdx['original']

        head_insert_func = None
        if self.head_insert_view:
            head_insert_func = (
                self.head_insert_view.create_insert_func(wbrequest))

        result = (self.content_rewriter.rewrite_content(
            urlrewriter,
            headers=status_headers,
            stream=stream,
            head_insert_func=head_insert_func,
            urlkey=cdx['urlkey'],
            cdx=cdx))

        (status_headers, response_iter, is_rewritten) = result

        # buffer response if buffering enabled
        if self.buffer_response:
            response_iter = self.buffered_response(status_headers,
                                                   response_iter)

        response = self.response_class(status_headers,
                                       response_iter,
                                       wbrequest=wbrequest,
                                       cdx=cdx)

        # notify reporter callback, if any
        if self._reporter:
            self._reporter(wbrequest, cdx, response)

        return response
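In both replay_capture variants the Content-Length header may be missing or malformed, which is presumably why LimitReader.wrap_stream() is used instead of the constructor: the stream should only be capped when the length parses to a usable integer. A sketch of that guard under that assumption (helper name invented; LimitReader import not shown, its module path varies by version):

def wrap_if_length_known(stream, length):
    # Cap `stream` at `length` bytes only when `length` is usable;
    # otherwise hand the stream back untouched.
    try:
        length = int(length)
    except (TypeError, ValueError):
        return stream
    if length < 0:
        return stream
    return LimitReader(stream, length)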
Example #4
        def read_range():
            with open(spec['name'], 'rb') as fh:
                fh.seek(start)
                fh = LimitReader.wrap_stream(fh, maxlen)
                while True:
                    buf = fh.read()
                    if not buf:
                        break

                    yield buf
Example #5
        def read_range():
            with open(spec['name'], 'rb') as fh:
                fh.seek(start)
                fh = LimitReader.wrap_stream(fh, maxlen)
                while True:
                    buf = fh.read()
                    if not buf:
                        break

                    yield buf
Example #6
    def read_range():
        with io.BytesIO(
                joined
        ) as fh:  # Perma changes 2: replaced real file w/ BytesIO
            fh.seek(start)
            fh = LimitReader.wrap_stream(fh, maxlen)
            while True:
                buf = fh.read()
                if not buf:
                    break

                yield buf
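All three read_range variants follow the same contract: seek to the start of the byte range, cap the reader at maxlen, then yield chunks until the capped reader reports EOF. The contract can also be exercised without LimitReader by tracking the remaining budget by hand; this self-contained imitation (this read_range and its chunk_size parameter are illustrative, not the code above) shows the equivalence:

import io

def read_range(data, start, maxlen, chunk_size=16):
    # Yield at most `maxlen` bytes of `data` starting at `start`,
    # in `chunk_size` pieces -- the same contract as the generators above.
    with io.BytesIO(data) as fh:
        fh.seek(start)
        remaining = maxlen
        while remaining > 0:
            buf = fh.read(min(chunk_size, remaining))
            if not buf:
                break
            remaining -= len(buf)
            yield buf

assert b''.join(read_range(b'0123456789', 2, 5)) == b'23456'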
Example #7
    def fetch_http(self,
                   url,
                   urlkey=None,
                   env=None,
                   req_headers=None,
                   follow_redirects=False,
                   ignore_proxies=False,
                   verify=True):

        method = 'GET'
        data = None

        proxies = None
        if not ignore_proxies:
            proxies = self.proxies

        if not req_headers:
            req_headers = {}

        if env is not None:
            method = env['REQUEST_METHOD'].upper()
            input_ = env['wsgi.input']

            req_headers.update(self.translate_headers(url, urlkey, env))

            if method in ('POST', 'PUT'):
                len_ = env.get('CONTENT_LENGTH')
                if len_:
                    data = LimitReader(input_, int(len_))
                else:
                    data = input_

        response = requests.request(method=method,
                                    url=url,
                                    data=data,
                                    headers=req_headers,
                                    allow_redirects=follow_redirects,
                                    proxies=proxies,
                                    stream=True,
                                    verify=verify)

        statusline = str(response.status_code) + ' ' + response.reason

        headers = response.headers.items()
        stream = response.raw

        status_headers = StatusAndHeaders(statusline, headers)

        return (status_headers, stream)
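Capping env['wsgi.input'] at CONTENT_LENGTH keeps the proxied request body exactly as long as the client declared; the WSGI spec expects applications not to read past that point, and not every server reliably simulates EOF there, so streaming the raw input into requests could otherwise over-read or block. A toy version of the same cap over a fabricated environ (the dict below is invented for illustration):

import io

environ = {
    'REQUEST_METHOD': 'POST',
    'CONTENT_LENGTH': '4',
    'wsgi.input': io.BytesIO(b'dataEXTRA'),
}

length = int(environ['CONTENT_LENGTH'])
body = environ['wsgi.input'].read(length)   # LimitReader applies the same cap, but lazily
assert body == b'data'                      # only the declared body, not the trailing bytes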
Example #8
    def do_upload(self, filename, stream, user, coll, rec, offset, length):
        stream.seek(offset)

        logger.debug('do_upload(): {0} offset: {1}: len: {2}'.format(rec, offset, length))

        stream = LimitReader(stream, length)
        headers = {'Content-Length': str(length)}

        upload_url = self.upload_path.format(record_host=self.record_host,
                                             user=user,
                                             coll=coll,
                                             rec=rec)

        r = requests.put(upload_url,
                         headers=headers,
                         data=stream)
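Here the cap serves a slightly different purpose: stream is positioned at offset inside a larger file, so LimitReader stops requests from uploading bytes that belong to whatever follows the record, while the explicit Content-Length header advertises the same size to the server. A toy version of carving one record out of a shared buffer (names and data are illustrative):

import io

combined = b'RECORD-ONE' + b'RECORD-TWO'

def slice_record(stream, offset, length):
    # Return exactly one record's bytes from a stream that holds many,
    # the way do_upload() positions and caps the upload body.
    stream.seek(offset)
    return stream.read(length)   # LimitReader defers this cap until requests reads the body

assert slice_record(io.BytesIO(combined), 0, 10) == b'RECORD-ONE'
assert slice_record(io.BytesIO(combined), 10, 10) == b'RECORD-TWO'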
Example #9
    def load(self, url, offset=0, length=-1):
        # first try to fetch url contents from cache
        cache_key = 'warc-' + re.sub(r'[^\w-]', '', url)
        file_contents = django_cache.get(cache_key)
        if not file_contents:
            # url wasn't in cache -- fetch entire contents of url from super() and put in cache
            file_contents = super(CachedLoader, self).load(url).read()
            django_cache.set(cache_key, file_contents, timeout=60)  # use a short timeout so large warcs don't evict everything else in the cache

        # turn string contents of url into file-like object
        afile = StringIO.StringIO(file_contents)

        # --- from here down is taken from super() ---
        if offset > 0:
            afile.seek(offset)

        if length >= 0:
            return LimitReader(afile, length)
        else:
            return afile
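load() mirrors how archived records are usually addressed: seek to the record's byte offset, then read at most its length, leaving the rest of the file untouched. A self-contained imitation of the same (offset, length) contract over an in-memory buffer (load_slice is an invented name; BytesIO stands in for LimitReader):

import io

def load_slice(data, offset=0, length=-1):
    # Mimic CachedLoader.load(): seek to `offset` and, when `length`
    # is non-negative, cap further reads at `length` bytes.
    afile = io.BytesIO(data)
    if offset > 0:
        afile.seek(offset)
    if length >= 0:
        return io.BytesIO(afile.read(length))  # stand-in for LimitReader(afile, length)
    return afile

assert load_slice(b'0123456789', offset=3, length=4).read() == b'3456'
assert load_slice(b'0123456789', offset=3).read() == b'3456789'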
Example #10
    def parse_record_stream(self, stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        Pass statusline and known_format to _detect_type_load_headers()
        to facilitate parsing.
        """
        (the_format, rec_headers) = (self.
                                     _detect_type_load_headers(stream,
                                                               statusline,
                                                               known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                sub_len = rec_headers.total_len
                the_format = 'warc'

        is_err = False

        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True

        except (ValueError, TypeError):
            is_err = True

        # err condition
        if is_err:
            length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        # don't parse the http record at all
        if no_record_parse:
            status_headers = None  # StatusAndHeaders('', [])

        # if empty record (error or otherwise) set status to 204
        elif length == 0:
            if is_err:
                msg = '204 Possible Error'
            else:
                msg = '204 No Content'

            status_headers = StatusAndHeaders(msg, [])

        # response record or non-empty revisit: parse HTTP status and headers!
        elif (rec_type in ('response', 'revisit')
              and uri.startswith(self.HTTP_SCHEMES)):
            status_headers = self.http_parser.parse(stream)

        # request record: parse request
        elif ((rec_type == 'request')
              and uri.startswith(self.HTTP_SCHEMES)):
            status_headers = self.http_req_parser.parse(stream)

        # everything else: create a no-status entry, set content-type
        else:
            content_type_header = [('Content-Type', content_type)]

            if length is not None and length >= 0:
                content_type_header.append(('Content-Length', str(length)))

            status_headers = StatusAndHeaders('200 OK', content_type_header)

        return ArcWarcRecord(the_format, rec_type,
                             rec_headers, stream, status_headers,
                             content_type, length)
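Before the stream is capped, the declared record length is normalised: a value that fails int() conversion, or that goes negative after the sub_len adjustment, is treated as an error and clamped to 0, which later surfaces as the '204 Possible Error' status. A compact restatement of that normalisation (normalize_length is an invented helper, not part of the loader):

def normalize_length(length, sub_len=0):
    # Return (length, is_err) the way parse_record_stream() does:
    # unparsable or negative lengths collapse to (0, True).
    try:
        if length is None:
            return None, False
        length = int(length) - sub_len
        if length < 0:
            return 0, True
        return length, False
    except (ValueError, TypeError):
        return 0, True

assert normalize_length('1024') == (1024, False)
assert normalize_length('abc') == (0, True)
assert normalize_length('10', sub_len=20) == (0, True)
assert normalize_length(None) == (None, False)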
Example #11
    def load(self, url, offset=0, length=-1):

        # first try to fetch url contents from cache
        cache_key = 'warc-' + re.sub(r'[^\w-]', '', url)
        mirror_name_cache_key = cache_key + '-mirror-name'
        mirror_name = ''

        file_contents = django_cache.get(cache_key)
        if file_contents is None:
            # url wasn't in cache -- load contents

            # try fetching from each mirror in the LOCKSS network, in random order
            if settings.USE_LOCKSS_REPLAY:
                mirrors = Mirror.get_cached_mirrors()
                random.shuffle(mirrors)

                for mirror in mirrors:
                    lockss_key = url.replace('file://', '').replace(
                        WARC_STORAGE_PATH,
                        'https://' + settings.HOST + '/lockss/fetch')
                    lockss_url = urljoin(mirror['content_url'], 'ServeContent')
                    try:
                        logging.info("Fetching from %s?url=%s" %
                                     (lockss_url, lockss_key))
                        response = requests.get(lockss_url,
                                                params={'url': lockss_key})
                        assert response.ok
                        file_contents = response.content
                        mirror_name = mirror['name']
                        logging.info("Got content from lockss")
                    except (requests.ConnectionError, requests.Timeout,
                            AssertionError) as e:
                        logging.info("Couldn't get from lockss: %s" % e)

            # If url wasn't in LOCKSS yet or LOCKSS is disabled, fetch from local storage using super()
            if file_contents is None:
                file_contents = super(CachedLoader, self).load(url).read()
                logging.info("Got content from local disk")

            # cache file contents
            # use a short timeout so large warcs don't evict everything else in the cache
            django_cache.set(cache_key, file_contents, timeout=60)
            django_cache.set(mirror_name_cache_key, mirror_name, timeout=60)

        else:
            mirror_name = django_cache.get(mirror_name_cache_key)
            #logging.info("Got content from cache")

        # set wbrequest.mirror_name so it can be displayed in template later
        thread_local_data.wbrequest.mirror_name = mirror_name

        # turn string contents of url into file-like object
        afile = StringIO.StringIO(file_contents)

        # --- from here down is taken from super() ---
        if offset > 0:
            afile.seek(offset)

        if length >= 0:
            return LimitReader(afile, length)
        else:
            return afile
Example #12
    def parse_record_stream(self,
                            stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        Pass statusline and known_format to _detect_type_load_headers()
        to facilitate parsing.
        """
        (the_format, rec_headers) = (self._detect_type_load_headers(
            stream, statusline, known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format == 'warc':
            rec_type = rec_headers.get_header('WARC-Type')
            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            sub_len = 0

        is_err = False

        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True

        except (ValueError, TypeError):
            is_err = True

        # err condition
        if is_err:
            length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        # don't parse the http record at all
        if no_record_parse:
            status_headers = None  # StatusAndHeaders('', [])

        # if empty record (error or otherwise) set status to 204
        elif length == 0:
            if is_err:
                msg = '204 Possible Error'
            else:
                msg = '204 No Content'

            status_headers = StatusAndHeaders(msg, [])

        # response record or non-empty revisit: parse HTTP status and headers!
        elif (rec_type in ('response', 'revisit')
              and uri.startswith(self.HTTP_SCHEMES)):
            status_headers = self.http_parser.parse(stream)

        # request record: parse request
        elif ((rec_type == 'request') and uri.startswith(self.HTTP_SCHEMES)):
            status_headers = self.http_req_parser.parse(stream)

        # everything else: create a no-status entry, set content-type
        else:
            content_type_header = [('Content-Type', content_type)]

            if length is not None and length >= 0:
                content_type_header.append(('Content-Length', str(length)))

            status_headers = StatusAndHeaders('200 OK', content_type_header)

        return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                             status_headers, content_type, length)
Example #13
    def parse_record_stream(self, stream,
                            statusline=None, known_format=None):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        Pass statusline and known_format to _detect_type_load_headers()
        to facilitate parsing.
        """
        (the_format, rec_headers) = (self.
                                     _detect_type_load_headers(stream,
                                                               statusline,
                                                               known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format == 'warc':
            rec_type = rec_headers.get_header('WARC-Type')
            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            sub_len = 0

        is_err = False

        try:
            length = int(length) - sub_len
            if length < 0:
                is_err = True
        except ValueError:
            is_err = True

        # err condition
        if is_err:
            length = 0
        # limit stream to the length for all valid records
        stream = LimitReader.wrap_stream(stream, length)

        # if empty record (error or otherwise) set status to -
        if length == 0:
            status_headers = StatusAndHeaders('- None', [])

        # response record or non-empty revisit: parse HTTP status and headers!
        elif (rec_type in ('response', 'revisit') and
              not uri.startswith(('dns:', 'whois:'))):
            status_headers = self.http_parser.parse(stream)

        # request record: parse request
        elif ((rec_type == 'request') and
              not uri.startswith(('dns:', 'whois:'))):
            status_headers = self.http_req_parser.parse(stream)

        # everything else: create a no-status entry, set content-type
        else:
            content_type_header = [('Content-Type', content_type)]
            status_headers = StatusAndHeaders('- OK', content_type_header)

        return ArcWarcRecord(the_format, rec_type,
                             rec_headers, stream, status_headers,
                             content_type, length)
Example #14
    def fetch_http(self,
                   url,
                   urlkey=None,
                   env=None,
                   req_headers=None,
                   follow_redirects=False,
                   skip_recording=False,
                   verify=True):

        method = 'GET'
        data = None

        proxies = None
        if not skip_recording:
            proxies = self.proxies

        if not req_headers:
            req_headers = {}

        if env is not None:
            method = env['REQUEST_METHOD'].upper()
            input_ = env['wsgi.input']

            req_headers.update(self.translate_headers(url, urlkey, env))

            if method in ('POST', 'PUT'):
                len_ = env.get('CONTENT_LENGTH')
                if len_:
                    data = LimitReader(input_, int(len_))
                else:
                    data = input_

        response = self.live_request(method=method,
                                     url=url,
                                     data=data,
                                     headers=req_headers,
                                     allow_redirects=follow_redirects,
                                     proxies=proxies,
                                     stream=True,
                                     verify=verify)

        statusline = str(response.status_code) + ' ' + response.reason

        headers = response.headers.items()

        stream = response.raw

        try:  #pragma: no cover
            #PY 3
            headers = stream._original_response.headers._headers
        except:  #pragma: no cover
            #PY 2
            headers = []
            resp_headers = stream._original_response.msg.headers
            for h in resp_headers:
                n, v = h.split(':', 1)
                n = n.strip()
                v = v.strip()
                headers.append((n, v))

        status_headers = StatusAndHeaders(statusline, headers)

        return (status_headers, stream)
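The try/except at the end digs into stream._original_response most likely because requests exposes response.headers as a case-insensitive dict, which holds a single entry per header name, while the underlying http.client response keeps repeated headers (several Set-Cookie lines, for instance) as separate pairs. A standard-library illustration of the difference (not pywb code):

from email.message import Message

# http.client stores response headers in an email.message-style object,
# so repeated headers survive as separate (name, value) pairs ...
msg = Message()
msg['Set-Cookie'] = 'a=1'
msg['Set-Cookie'] = 'b=2'
assert msg.items() == [('Set-Cookie', 'a=1'), ('Set-Cookie', 'b=2')]

# ... while a plain mapping keeps one entry per name (requests actually
# joins duplicate values with ', ', but the individual lines are gone).
merged = dict(msg.items())
assert merged == {'Set-Cookie': 'b=2'}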