def test_err_compress_mix():
    """A gzip member followed by uncompressed bytes is treated as invalid.

    This variant expects the trailing raw bytes to be dropped: after moving
    to the next member, the reader yields an empty result.
    """
    payload = compress('ABC') + b'123'
    reader = DecompressingBufferedReader(BytesIO(payload), decomp_type='gzip')

    # The leading gzip member decompresses normally.
    first_member = reader.read()
    assert first_member == b'ABC'

    # Advance past the first member; the non-gzip tail must read as empty.
    reader.read_next_member()
    assert reader.read() == b''
def test_err_compress_mix():
    """A gzip member followed by uncompressed bytes is treated as invalid.

    This variant expects reading the non-gzip tail to raise ``zlib.error``.
    """
    payload = compress('ABC') + b'123'
    reader = DecompressingBufferedReader(BytesIO(payload), decomp_type='gzip')

    # Consume the valid member, then step to the (invalid) next one.
    reader.read()
    reader.read_next_member()

    with pytest.raises(zlib.error):
        reader.read()
def extract_text(entry):
    """Return the full body of *entry* as bytes, or ``b''`` if unusable.

    The body is read from ``entry.buffer`` (rewound to the start first). If
    the record's status headers carry a ``content-encoding`` header, the
    buffer is wrapped in a ``DecompressingBufferedReader`` before reading.

    Returns ``b''`` when the entry has no buffer or when
    ``is_binary_string`` flags the collected body as binary.
    """
    reader = entry.buffer
    if not reader:
        return b''

    reader.seek(0)

    if entry.record.status_headers.get_header('content-encoding'):
        reader = DecompressingBufferedReader(reader)

    # Collect the chunks and join once: repeated ``bytes +=`` copies the
    # accumulated body on every iteration and is quadratic in its size.
    chunks = []
    while True:
        chunk = reader.read()
        if not chunk:
            break
        chunks.append(chunk)

    buff = b''.join(chunks)

    if is_binary_string(buff):
        return b''

    return buff
def test_brotli():
    """Brotli ('br') content is decompressed by DecompressingBufferedReader."""
    expected = b'The quick brown fox jumps over the lazy dog' * 4096

    with open(get_test_dir() + 'text_content/quickfox_repeated.compressed', 'rb') as fh:
        x = DecompressingBufferedReader(fh, decomp_type='br')
        # The comparison must be asserted; as a bare expression its result
        # was discarded and the test could never fail on wrong output.
        assert x.read() == expected
def rewrite_content(self, urlrewriter, headers, stream,
                    head_insert_func=None, urlkey='', cdx=None):
    """Rewrite the response headers and, for text content, the body.

    :param urlrewriter: url rewriter; its ``wburl`` selects the mode
        (identity, banner-only, ``js_`` / ``cs_`` modifier).
    :param headers: original status headers of the response.
    :param stream: readable stream with the response body.
    :param head_insert_func: optional callable ``(rule, cdx)`` returning
        the text to insert into html content.
    :param urlkey: key used to look up the matching rule in the ruleset.
    :param cdx: passed through to ``head_insert_func``.
    :return: ``(status_headers, gen, is_rewritten)``; ``is_rewritten`` is
        ``True`` only when the body goes through a text rewriter.
    """
    wb_url = urlrewriter.wburl

    # identity replay, or banner-only with nothing to insert:
    # only sanitize the content, no rewriting at all
    if (wb_url.is_identity or
            (not head_insert_func and wb_url.is_banner_only)):
        status_headers, stream = self.sanitize_content(headers, stream)
        return (status_headers, self.stream_to_gen(stream), False)

    if wb_url.is_banner_only:
        urlrewriter = None

    rule = self.ruleset.get_first_match(urlkey)

    (rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
                                                        rule,
                                                        headers,
                                                        stream)

    status_headers = rewritten_headers.status_headers

    # use rewritten headers, but no further rewriting needed
    if rewritten_headers.text_type is None:
        return (status_headers, self.stream_to_gen(stream), False)

    # Handle text content rewriting
    # ====================================================================
    text_type = rewritten_headers.text_type
    mod = wb_url.mod

    # NOTE(review): defaults to a str while stream.read() presumably
    # returns bytes -- confirm the downstream generators accept both
    first_buff = ''

    # special case -- need to ungzip the body
    if rewritten_headers.contains_removed_header('content-encoding', 'gzip'):
        # optimize: if already a ChunkedDataReader, add gzip
        if isinstance(stream, ChunkedDataReader):
            stream.set_decomp('gzip')
        else:
            stream = DecompressingBufferedReader(stream)

    # an explicit js_/cs_ modifier on the url is resolved against the
    # text type derived from the headers
    if mod == 'js_':
        text_type, stream = self._resolve_text_type('js', text_type, stream)
    elif mod == 'cs_':
        text_type, stream = self._resolve_text_type('css', text_type, stream)

    rewriter_class = rule.rewriters[text_type]

    # for html, need to perform header insert, supply js, css, xml
    # rewriters
    if text_type == 'html':
        head_insert_str = ''
        charset = rewritten_headers.charset

        # if no charset set, attempt to extract from first 1024
        if not rewritten_headers.charset:
            first_buff = stream.read(1024)
            charset = self._extract_html_charset(first_buff,
                                                 status_headers)

        if head_insert_func:
            if not charset:
                charset = 'utf-8'
            head_insert_str = head_insert_func(rule, cdx)
            head_insert_str = head_insert_str.encode(charset)

        if wb_url.is_banner_only:
            gen = self._head_insert_only_gen(head_insert_str,
                                             stream,
                                             first_buff)

            # a missing header (None) or a non-numeric value means the
            # Content-Length is simply left untouched
            content_len = headers.get_header('Content-Length')
            try:
                content_len = int(content_len)
            except (TypeError, ValueError):
                content_len = None

            # only a positive length is adjusted for the inserted bytes
            if content_len is not None and content_len > 0:
                content_len = str(content_len + len(head_insert_str))
                status_headers.replace_header('Content-Length',
                                              content_len)

            return (status_headers, gen, False)

        rewriter = rewriter_class(urlrewriter,
                                  js_rewriter_class=rule.rewriters['js'],
                                  css_rewriter_class=rule.rewriters['css'],
                                  head_insert=head_insert_str,
                                  defmod=self.defmod,
                                  parse_comments=rule.parse_comments)

    else:
        if wb_url.is_banner_only:
            return (status_headers, self.stream_to_gen(stream), False)

        # apply one of (js, css, xml) rewriters
        rewriter = rewriter_class(urlrewriter)

    # align to line end for all non-html rewriting
    align = (text_type != 'html')

    # Create rewriting generator
    gen = self.rewrite_text_stream_to_gen(stream,
                                          rewrite_func=rewriter.rewrite,
                                          final_read_func=rewriter.close,
                                          first_buff=first_buff,
                                          align_to_line=align)

    return (status_headers, gen, True)
def rewrite_content(self, urlrewriter, headers, stream,
                    head_insert_func=None, urlkey='', cdx=None):
    """Rewrite the response headers and, for text content, the body.

    :param urlrewriter: url rewriter; its ``wburl`` selects the mode
        (identity, banner-only, ``js_`` / ``cs_`` modifier).
    :param headers: original status headers of the response.
    :param stream: readable stream with the response body.
    :param head_insert_func: optional callable ``(rule, cdx)`` returning
        the text to insert into html content.
    :param urlkey: key used to look up the matching rule in the ruleset.
    :param cdx: passed through to ``head_insert_func``.
    :return: ``(status_headers, gen, is_rewritten)``; ``is_rewritten`` is
        ``True`` only when the body goes through a text rewriter.
    """
    wb_url = urlrewriter.wburl

    # identity replay, or banner-only with nothing to insert:
    # only sanitize the content, no rewriting at all
    if (wb_url.is_identity or
            (not head_insert_func and wb_url.is_banner_only)):
        status_headers, stream = self.sanitize_content(headers, stream)
        return (status_headers, self.stream_to_gen(stream), False)

    if wb_url.is_banner_only:
        urlrewriter = None

    rule = self.ruleset.get_first_match(urlkey)

    (rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
                                                        rule,
                                                        headers,
                                                        stream)

    status_headers = rewritten_headers.status_headers

    # use rewritten headers, but no further rewriting needed
    if rewritten_headers.text_type is None:
        return (status_headers, self.stream_to_gen(stream), False)

    # Handle text content rewriting
    # ====================================================================
    text_type = rewritten_headers.text_type
    mod = wb_url.mod

    # NOTE(review): defaults to a str while stream.read() presumably
    # returns bytes -- confirm the downstream generators accept both
    first_buff = ''

    # special case -- need to ungzip the body
    if rewritten_headers.contains_removed_header('content-encoding', 'gzip'):
        # optimize: if already a ChunkedDataReader, add gzip
        if isinstance(stream, ChunkedDataReader):
            stream.set_decomp('gzip')
        else:
            stream = DecompressingBufferedReader(stream)

    # an explicit js_/cs_ modifier on the url is resolved against the
    # text type derived from the headers
    if mod == 'js_':
        text_type, stream = self._resolve_text_type('js', text_type, stream)
    elif mod == 'cs_':
        text_type, stream = self._resolve_text_type('css', text_type, stream)

    rewriter_class = rule.rewriters[text_type]

    # for html, need to perform header insert, supply js, css, xml
    # rewriters
    if text_type == 'html':
        head_insert_str = ''
        charset = rewritten_headers.charset

        # if no charset set, attempt to extract from first 1024
        if not rewritten_headers.charset:
            first_buff = stream.read(1024)
            charset = self._extract_html_charset(first_buff,
                                                 status_headers)

        if head_insert_func:
            if not charset:
                charset = 'utf-8'
            head_insert_str = head_insert_func(rule, cdx)
            head_insert_str = head_insert_str.encode(charset)

        if wb_url.is_banner_only:
            gen = self._head_insert_only_gen(head_insert_str,
                                             stream,
                                             first_buff)

            # a missing header (None) or a non-numeric value means the
            # Content-Length is simply left untouched
            content_len = headers.get_header('Content-Length')
            try:
                content_len = int(content_len)
            except (TypeError, ValueError):
                content_len = None

            # only a positive length is adjusted for the inserted bytes
            if content_len is not None and content_len > 0:
                content_len = str(content_len + len(head_insert_str))
                status_headers.replace_header('Content-Length',
                                              content_len)

            return (status_headers, gen, False)

        rewriter = rewriter_class(urlrewriter,
                                  js_rewriter_class=rule.rewriters['js'],
                                  css_rewriter_class=rule.rewriters['css'],
                                  head_insert=head_insert_str,
                                  defmod=self.defmod,
                                  parse_comments=rule.parse_comments)

    else:
        if wb_url.is_banner_only:
            return (status_headers, self.stream_to_gen(stream), False)

        # apply one of (js, css, xml) rewriters
        rewriter = rewriter_class(urlrewriter)

    # align to line end for all non-html rewriting
    align = (text_type != 'html')

    # Create rewriting generator
    gen = self.rewrite_text_stream_to_gen(stream,
                                          rewrite_func=rewriter.rewrite,
                                          final_read_func=rewriter.close,
                                          first_buff=first_buff,
                                          align_to_line=align)

    return (status_headers, gen, True)