def test_limit_post():
    """POST only the first 3 bytes of a 7-byte buffer through a
    LimitReader and check that httpbin echoes the truncated body."""
    limited = LimitReader(BytesIO(b'abcdefg'), 3)
    resp = requests.request(method='POST',
                            url='http://httpbin.org/post',
                            data=limited,
                            headers={'Content-Length': '3'})
    assert '"abc"' in resp.text
def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files): (status_headers, stream) = (self.content_loader. resolve_headers_and_payload(cdx, failed_files, cdx_loader)) # check and reject self-redirect self._reject_self_redirect(wbrequest, cdx, status_headers) # check if redir is needed redir_response = self._redirect_if_needed(wbrequest, cdx) if redir_response: return redir_response length = status_headers.get_header('content-length') stream = LimitReader.wrap_stream(stream, length) # one more check for referrer-based self-redirect # TODO: evaluate this, as refreshing in browser may sometimes cause # referrer to be set to the same page, incorrectly skipping a capture # self._reject_referrer_self_redirect(wbrequest) urlrewriter = wbrequest.urlrewriter # if using url rewriter, use original url for rewriting purposes if wbrequest and wbrequest.wb_url: wbrequest.wb_url.url = cdx['original'] head_insert_func = None if self.head_insert_view: head_insert_func = (self.head_insert_view. create_insert_func(wbrequest)) result = (self.content_rewriter. rewrite_content(urlrewriter, headers=status_headers, stream=stream, head_insert_func=head_insert_func, urlkey=cdx['urlkey'], cdx=cdx)) (status_headers, response_iter, is_rewritten) = result # buffer response if buffering enabled if self.buffer_response: response_iter = self.buffered_response(status_headers, response_iter) response = self.response_class(status_headers, response_iter, wbrequest=wbrequest, cdx=cdx) # notify reporter callback, if any if self._reporter: self._reporter(wbrequest, cdx, response) return response
def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
    """Replay the capture described by *cdx* as a rewritten response.

    Loads the archived record, handles redirect cases, rewrites the
    payload, and returns an instance of ``self.response_class``.
    """
    status_headers, stream = self.content_loader.resolve_headers_and_payload(
        cdx, failed_files, cdx_loader)

    # refuse to serve a capture that redirects to itself
    self._reject_self_redirect(wbrequest, cdx, status_headers)

    # serve a redirect instead of the capture when one is required
    redirect = self._redirect_if_needed(wbrequest, cdx)
    if redirect:
        return redirect

    content_len = status_headers.get_header('content-length')
    stream = LimitReader.wrap_stream(stream, content_len)

    # one more check for referrer-based self-redirect
    # TODO: evaluate this, as refreshing in browser may sometimes cause
    # referrer to be set to the same page, incorrectly skipping a capture
    # self._reject_referrer_self_redirect(wbrequest)

    rewriter = wbrequest.urlrewriter

    # rewrite against the original (pre-archive) url
    if wbrequest and wbrequest.wb_url:
        wbrequest.wb_url.url = cdx['original']

    insert_func = (self.head_insert_view.create_insert_func(wbrequest)
                   if self.head_insert_view else None)

    status_headers, body_iter, is_rewritten = self.content_rewriter.rewrite_content(
        rewriter,
        headers=status_headers,
        stream=stream,
        head_insert_func=insert_func,
        urlkey=cdx['urlkey'],
        cdx=cdx)

    # optionally buffer the entire body before responding
    if self.buffer_response:
        body_iter = self.buffered_response(status_headers, body_iter)

    wb_response = self.response_class(status_headers, body_iter,
                                      wbrequest=wbrequest, cdx=cdx)

    # notify reporter callback, if any
    if self._reporter:
        self._reporter(wbrequest, cdx, wb_response)

    return wb_response
def read_range():
    """Yield up to `maxlen` bytes of the file named by `spec['name']`,
    starting at offset `start` (all closure variables)."""
    with open(spec['name'], 'rb') as source:
        source.seek(start)
        limited = LimitReader.wrap_stream(source, maxlen)
        # read until the limit reader is exhausted
        for chunk in iter(limited.read, b''):
            yield chunk
def read_range():
    """Yield up to `maxlen` bytes of the in-memory `joined` buffer,
    starting at offset `start` (all closure variables)."""
    # Perma changes 2: replaced real file w/ BytesIO
    with io.BytesIO(joined) as source:
        source.seek(start)
        limited = LimitReader.wrap_stream(source, maxlen)
        # read until the limit reader is exhausted
        for chunk in iter(limited.read, b''):
            yield chunk
def fetch_http(self, url, urlkey=None, env=None, req_headers=None,
               follow_redirects=False, ignore_proxies=False, verify=True):
    """Fetch *url* live and return a (StatusAndHeaders, raw stream) pair.

    When *env* (a WSGI environ) is supplied, replay its request method
    and translated headers, and — for POST/PUT — its body, capped at the
    declared CONTENT_LENGTH via LimitReader.
    """
    method = 'GET'
    body = None
    proxies = self.proxies if not ignore_proxies else None
    req_headers = req_headers or {}

    if env is not None:
        method = env['REQUEST_METHOD'].upper()
        wsgi_input = env['wsgi.input']
        req_headers.update(self.translate_headers(url, urlkey, env))

        if method in ('POST', 'PUT'):
            content_len = env.get('CONTENT_LENGTH')
            if content_len:
                body = LimitReader(wsgi_input, int(content_len))
            else:
                body = wsgi_input

    resp = requests.request(method=method, url=url, data=body,
                            headers=req_headers,
                            allow_redirects=follow_redirects,
                            proxies=proxies, stream=True, verify=verify)

    statusline = str(resp.status_code) + ' ' + resp.reason
    status_headers = StatusAndHeaders(statusline, resp.headers.items())

    return (status_headers, resp.raw)
def do_upload(self, filename, stream, user, coll, rec, offset, length):
    """PUT `length` bytes of `stream`, starting at `offset`, to the
    recording upload endpoint for the given user/coll/rec."""
    stream.seek(offset)

    logger.debug('do_upload(): {0} offset: {1}: len: {2}'.format(rec, offset, length))

    # cap the uploaded body at exactly `length` bytes
    body = LimitReader(stream, length)

    target = self.upload_path.format(record_host=self.record_host,
                                     user=user,
                                     coll=coll,
                                     rec=rec)

    requests.put(target,
                 headers={'Content-Length': str(length)},
                 data=body)
def load(self, url, offset=0, length=-1):
    """Fetch the contents of *url* (via cache when possible) and return
    a file-like object positioned at *offset*, wrapped in a LimitReader
    of *length* bytes when length >= 0 (unlimited otherwise).

    Fix: the cache-key regex is now a raw string — a plain string with
    that escape triggers a DeprecationWarning on Python 3 (and is slated
    to become a SyntaxError).
    """
    # first try to fetch url contents from cache
    cache_key = 'warc-' + re.sub(r'[^\w-]', '', url)
    file_contents = django_cache.get(cache_key)

    if not file_contents:
        # url wasn't in cache -- fetch entire contents of url from super() and put in cache
        file_contents = super(CachedLoader, self).load(url).read()
        # use a short timeout so large warcs don't evict everything else in the cache
        django_cache.set(cache_key, file_contents, timeout=60)

    # turn string contents of url into file-like object
    afile = StringIO.StringIO(file_contents)

    # --- from here down is taken from super() ---
    if offset > 0:
        afile.seek(offset)

    if length >= 0:
        return LimitReader(afile, length)
    else:
        return afile
def parse_record_stream(self, stream, statusline=None,
                        known_format=None, no_record_parse=False):
    """ Parse file-like stream and return an ArcWarcRecord
    encapsulating the record headers, http headers (if any),
    and a stream limited to the remainder of the record.

    Pass statusline and known_format to detect_type_loader_headers()
    to facilitate parsing.

    :param stream: file-like object positioned at the start of a record
    :param statusline: optional first line already read from the stream
    :param known_format: 'arc', 'warc' or 'arc2warc' to skip detection
    :param no_record_parse: if True, do not parse the inner http headers
    :returns: ArcWarcRecord with format, type, headers and limited stream
    """
    (the_format, rec_headers) = (self.
                                 _detect_type_load_headers(stream,
                                                           statusline,
                                                           known_format))

    if the_format == 'arc':
        uri = rec_headers.get_header('uri')
        length = rec_headers.get_header('length')
        content_type = rec_headers.get_header('content-type')
        # arc declared length includes the headers; subtract them out
        sub_len = rec_headers.total_len
        if uri and uri.startswith('filedesc://'):
            rec_type = 'arc_header'
        else:
            rec_type = 'response'

    elif the_format in ('warc', 'arc2warc'):
        rec_type = rec_headers.get_header('WARC-Type')
        uri = rec_headers.get_header('WARC-Target-URI')
        length = rec_headers.get_header('Content-Length')
        content_type = rec_headers.get_header('Content-Type')
        if the_format == 'warc':
            sub_len = 0
        else:
            # arc record converted on the fly: treat as warc from here on
            sub_len = rec_headers.total_len
            the_format = 'warc'

    is_err = False

    try:
        # a missing or malformed length marks the record as an error
        if length is not None:
            length = int(length) - sub_len
            if length < 0:
                is_err = True

    except (ValueError, TypeError):
        is_err = True

    # err condition
    if is_err:
        length = 0

    # limit stream to the length for all valid records
    if length is not None and length >= 0:
        stream = LimitReader.wrap_stream(stream, length)

    # don't parse the http record at all
    if no_record_parse:
        status_headers = None  # StatusAndHeaders('', [])

    # if empty record (error or otherwise) set status to 204
    elif length == 0:
        if is_err:
            msg = '204 Possible Error'
        else:
            msg = '204 No Content'

        status_headers = StatusAndHeaders(msg, [])

    # response record or non-empty revisit: parse HTTP status and headers!
    elif (rec_type in ('response', 'revisit')
          and uri.startswith(self.HTTP_SCHEMES)):
        status_headers = self.http_parser.parse(stream)

    # request record: parse request
    elif ((rec_type == 'request')
          and uri.startswith(self.HTTP_SCHEMES)):
        status_headers = self.http_req_parser.parse(stream)

    # everything else: create a no-status entry, set content-type
    else:
        content_type_header = [('Content-Type', content_type)]

        if length is not None and length >= 0:
            content_type_header.append(('Content-Length', str(length)))

        status_headers = StatusAndHeaders('200 OK', content_type_header)

    return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                         status_headers, content_type, length)
def load(self, url, offset=0, length=-1):
    """Fetch the contents of *url*, preferring the cache, then the
    LOCKSS mirror network, then local storage; return a file-like
    object positioned at *offset*, wrapped in a LimitReader of
    *length* bytes when length >= 0 (unlimited otherwise).

    Also records the serving mirror's name on the thread-local
    wbrequest for display.

    Fixes: (1) the cache-key regex is a raw string (the plain-string
    escape is deprecated on Python 3); (2) the mirror loop now stops
    after the first successful fetch instead of continuing to query —
    and overwrite the result from — the remaining mirrors.
    """
    # first try to fetch url contents from cache
    cache_key = 'warc-' + re.sub(r'[^\w-]', '', url)
    mirror_name_cache_key = cache_key + '-mirror-name'
    mirror_name = ''
    file_contents = django_cache.get(cache_key)

    if file_contents is None:
        # url wasn't in cache -- load contents

        # try fetching from each mirror in the LOCKSS network, in random order
        if settings.USE_LOCKSS_REPLAY:
            mirrors = Mirror.get_cached_mirrors()
            random.shuffle(mirrors)
            for mirror in mirrors:
                lockss_key = url.replace('file://', '').replace(
                    WARC_STORAGE_PATH,
                    'https://' + settings.HOST + '/lockss/fetch')
                lockss_url = urljoin(mirror['content_url'], 'ServeContent')
                try:
                    logging.info("Fetching from %s?url=%s" % (lockss_url, lockss_key))
                    response = requests.get(lockss_url, params={'url': lockss_key})
                    # NOTE: assert is stripped under -O; AssertionError is
                    # deliberately caught below as a "mirror miss"
                    assert response.ok
                    file_contents = response.content
                    mirror_name = mirror['name']
                    logging.info("Got content from lockss")
                    # stop after the first mirror that succeeds
                    break
                except (requests.ConnectionError, requests.Timeout, AssertionError) as e:
                    logging.info("Couldn't get from lockss: %s" % e)

        # If url wasn't in LOCKSS yet or LOCKSS is disabled, fetch from local storage using super()
        if file_contents is None:
            file_contents = super(CachedLoader, self).load(url).read()
            logging.info("Got content from local disk")

        # cache file contents
        # use a short timeout so large warcs don't evict everything else in the cache
        django_cache.set(cache_key, file_contents, timeout=60)
        django_cache.set(mirror_name_cache_key, mirror_name, timeout=60)
    else:
        mirror_name = django_cache.get(mirror_name_cache_key)
        #logging.info("Got content from cache")

    # set wbrequest.mirror_name so it can be displayed in template later
    thread_local_data.wbrequest.mirror_name = mirror_name

    # turn string contents of url into file-like object
    afile = StringIO.StringIO(file_contents)

    # --- from here down is taken from super() ---
    if offset > 0:
        afile.seek(offset)

    if length >= 0:
        return LimitReader(afile, length)
    else:
        return afile
def parse_record_stream(self, stream, statusline=None,
                        known_format=None, no_record_parse=False):
    """ Parse file-like stream and return an ArcWarcRecord
    encapsulating the record headers, http headers (if any),
    and a stream limited to the remainder of the record.

    Pass statusline and known_format to detect_type_loader_headers()
    to facilitate parsing.

    :param stream: file-like object positioned at the start of a record
    :param statusline: optional first line already read from the stream
    :param known_format: 'arc' or 'warc' to skip format detection
    :param no_record_parse: if True, do not parse the inner http headers
    :returns: ArcWarcRecord with format, type, headers and limited stream
    """
    (the_format, rec_headers) = (self._detect_type_load_headers(
        stream, statusline, known_format))

    if the_format == 'arc':
        uri = rec_headers.get_header('uri')
        length = rec_headers.get_header('length')
        content_type = rec_headers.get_header('content-type')
        # arc declared length includes the headers; subtract them out
        sub_len = rec_headers.total_len
        if uri and uri.startswith('filedesc://'):
            rec_type = 'arc_header'
        else:
            rec_type = 'response'

    elif the_format == 'warc':
        rec_type = rec_headers.get_header('WARC-Type')
        uri = rec_headers.get_header('WARC-Target-URI')
        length = rec_headers.get_header('Content-Length')
        content_type = rec_headers.get_header('Content-Type')
        sub_len = 0

    is_err = False

    try:
        # a missing or malformed length marks the record as an error
        if length is not None:
            length = int(length) - sub_len
            if length < 0:
                is_err = True

    except (ValueError, TypeError):
        is_err = True

    # err condition
    if is_err:
        length = 0

    # limit stream to the length for all valid records
    if length is not None and length >= 0:
        stream = LimitReader.wrap_stream(stream, length)

    # don't parse the http record at all
    if no_record_parse:
        status_headers = None  # StatusAndHeaders('', [])

    # if empty record (error or otherwise) set status to 204
    elif length == 0:
        if is_err:
            msg = '204 Possible Error'
        else:
            msg = '204 No Content'

        status_headers = StatusAndHeaders(msg, [])

    # response record or non-empty revisit: parse HTTP status and headers!
    elif (rec_type in ('response', 'revisit')
          and uri.startswith(self.HTTP_SCHEMES)):
        status_headers = self.http_parser.parse(stream)

    # request record: parse request
    elif ((rec_type == 'request')
          and uri.startswith(self.HTTP_SCHEMES)):
        status_headers = self.http_req_parser.parse(stream)

    # everything else: create a no-status entry, set content-type
    else:
        content_type_header = [('Content-Type', content_type)]

        if length is not None and length >= 0:
            content_type_header.append(('Content-Length', str(length)))

        status_headers = StatusAndHeaders('200 OK', content_type_header)

    return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                         status_headers, content_type, length)
def parse_record_stream(self, stream, statusline=None, known_format=None):
    """ Parse file-like stream and return an ArcWarcRecord
    encapsulating the record headers, http headers (if any),
    and a stream limited to the remainder of the record.

    Pass statusline and known_format to detect_type_loader_headers()
    to facilitate parsing.

    :param stream: file-like object positioned at the start of a record
    :param statusline: optional first line already read from the stream
    :param known_format: 'arc' or 'warc' to skip format detection
    :returns: ArcWarcRecord with format, type, headers and limited stream
    """
    (the_format, rec_headers) = (self.
                                 _detect_type_load_headers(stream,
                                                           statusline,
                                                           known_format))

    if the_format == 'arc':
        uri = rec_headers.get_header('uri')
        length = rec_headers.get_header('length')
        content_type = rec_headers.get_header('content-type')
        # arc declared length includes the headers; subtract them out
        sub_len = rec_headers.total_len
        if uri and uri.startswith('filedesc://'):
            rec_type = 'arc_header'
        else:
            rec_type = 'response'

    elif the_format == 'warc':
        rec_type = rec_headers.get_header('WARC-Type')
        uri = rec_headers.get_header('WARC-Target-URI')
        length = rec_headers.get_header('Content-Length')
        content_type = rec_headers.get_header('Content-Type')
        sub_len = 0

    is_err = False

    try:
        length = int(length) - sub_len
        if length < 0:
            is_err = True
    # Fix: a record with no length header leaves length as None, and
    # int(None) raises TypeError — previously uncaught, crashing the
    # parse instead of flagging the record as an error.
    except (ValueError, TypeError):
        is_err = True

    # err condition
    if is_err:
        length = 0

    # limit stream to the length for all valid records
    stream = LimitReader.wrap_stream(stream, length)

    # if empty record (error or otherwise) set status to -
    if length == 0:
        status_headers = StatusAndHeaders('- None', [])

    # response record or non-empty revisit: parse HTTP status and headers!
    elif (rec_type in ('response', 'revisit')
          and not uri.startswith(('dns:', 'whois:'))):
        status_headers = self.http_parser.parse(stream)

    # request record: parse request
    elif ((rec_type == 'request')
          and not uri.startswith(('dns:', 'whois:'))):
        status_headers = self.http_req_parser.parse(stream)

    # everything else: create a no-status entry, set content-type
    else:
        content_type_header = [('Content-Type', content_type)]
        status_headers = StatusAndHeaders('- OK', content_type_header)

    return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                         status_headers, content_type, length)
def fetch_http(self, url, urlkey=None, env=None, req_headers=None,
               follow_redirects=False, skip_recording=False, verify=True):
    """Fetch *url* live and return a (StatusAndHeaders, raw stream) pair.

    When *env* (a WSGI environ) is given, replay its request method and
    translated headers, and — for POST/PUT — its body, capped at the
    declared CONTENT_LENGTH via LimitReader. Proxies are used unless
    *skip_recording* is set.

    Fix: the py2/py3 header-extraction fallback used a bare ``except:``,
    which also swallows KeyboardInterrupt/SystemExit; narrowed to
    ``except Exception``.
    """
    method = 'GET'
    data = None

    proxies = None
    if not skip_recording:
        proxies = self.proxies

    if not req_headers:
        req_headers = {}

    if env is not None:
        method = env['REQUEST_METHOD'].upper()
        input_ = env['wsgi.input']
        req_headers.update(self.translate_headers(url, urlkey, env))

        if method in ('POST', 'PUT'):
            len_ = env.get('CONTENT_LENGTH')
            if len_:
                # limit replayed body to the declared content-length
                data = LimitReader(input_, int(len_))
            else:
                data = input_

    response = self.live_request(method=method,
                                 url=url,
                                 data=data,
                                 headers=req_headers,
                                 allow_redirects=follow_redirects,
                                 proxies=proxies,
                                 stream=True,
                                 verify=verify)

    statusline = str(response.status_code) + ' ' + response.reason

    headers = response.headers.items()

    stream = response.raw

    try:  #pragma: no cover
        # PY 3: raw header pairs, preserving duplicates and casing
        headers = stream._original_response.headers._headers
    except Exception:  #pragma: no cover
        # PY 2: rebuild (name, value) pairs from the raw header lines
        headers = []
        resp_headers = stream._original_response.msg.headers
        for h in resp_headers:
            n, v = h.split(':', 1)
            n = n.strip()
            v = v.strip()
            headers.append((n, v))

    status_headers = StatusAndHeaders(statusline, headers)

    return (status_headers, stream)