def load(self, url, offset, length):
    """Load a single archive record from ``url`` and parse its headers.

    The loader is selected from ``self.loaders`` by the URL scheme, and the
    record format is selected from ``ArchiveLoader.FORMAT_MAP`` by matching
    the end of ``url`` against each registered extension.

    :param url: location of the archive file; its scheme picks the loader
        and its suffix picks the (format, is_gzip) pair.
    :param offset: position of the record, passed to the loader as
        ``long(offset)``.
    :param length: size of the record; any value ``int()`` cannot convert
        (e.g. ``None`` or a non-numeric string) is replaced by ``-1``.
    :returns: a ``WBArchiveRecord`` built from ``(format, record type)``,
        the parsed record headers, the remaining stream and the
        status/headers object chosen for the record.
    :raises wbexceptions.UnknownLoaderProtocolException: no loader is
        registered for the URL scheme.
    :raises wbexceptions.UnknownArchiveFormatException: no extension in
        ``FORMAT_MAP`` matches ``url``, or the matched format is neither
        ``'arc'`` nor ``'warc'``.
    """
    url_parts = urlparse.urlsplit(url)

    # .get() yields None for an unregistered scheme instead of raising, so
    # the missing loader must be detected explicitly; otherwise the failure
    # only shows up later as an AttributeError on loader.load().
    loader = self.loaders.get(url_parts.scheme)
    if loader is None:
        raise wbexceptions.UnknownLoaderProtocolException(url)

    the_format = None
    for ext, iformat in ArchiveLoader.FORMAT_MAP.iteritems():
        if url.endswith(ext):
            the_format = iformat
            break

    if the_format is None:
        raise wbexceptions.UnknownArchiveFormatException(url)

    (a_format, is_gzip) = the_format

    # Only 'arc' and 'warc' have a header parser below; reject anything else
    # before any data is fetched rather than failing on an unbound local.
    if a_format not in ('arc', 'warc'):
        raise wbexceptions.UnknownArchiveFormatException(url)

    decomp = utils.create_decompressor() if is_gzip else None

    # NOTE(review): -1 presumably tells the loader/reader that the length is
    # unknown -- confirm against the loader implementations.
    try:
        length = int(length)
    except (TypeError, ValueError):
        length = -1

    raw = loader.load(url, long(offset), length)

    stream = LineReader(raw, length, self.chunk_size, decomp)

    if a_format == 'arc':
        rec_headers = self.arc_parser.parse(stream)
        # ARC records are always treated as responses.
        rec_type = 'response'
        # NOTE(review): this compares against the integer 0 while the WARC
        # branch compares against the string '0'; if the ARC parser returns
        # header values as strings this is never true -- confirm.
        empty = (rec_headers.get_header('length') == 0)
    else:
        rec_headers = self.warc_parser.parse(stream)
        rec_type = rec_headers.get_header('WARC-Type')
        empty = (rec_headers.get_header('Content-Length') == '0')

    if empty:
        # special case: empty w/arc record (hopefully a revisit)
        status_headers = StatusAndHeaders('204 No Content', [])

    elif rec_type == 'metadata' or rec_type == 'resource':
        # special case: warc records that are not expected to have http
        # headers; synthesize a 200 status carrying the record content-type
        content_type = rec_headers.get_header('Content-Type')
        status_headers = StatusAndHeaders('200 OK',
                                          [('Content-Type', content_type)])

    else:
        # response record: parse HTTP status and headers.
        # NOTE: HTTP/0.9 responses (no status line or headers) are not
        # special-cased and also go through the HTTP parser.
        status_headers = self.http_parser.parse(stream)

    return WBArchiveRecord((a_format, rec_type), rec_headers, stream,
                           status_headers)
def make_response(self, wbrequest, cdx, status_headers, stream, static_path):
    """Build the replay response for an archived record.

    Steps, in order:

    1. Run the self-redirect and redirect checks (helpers defined elsewhere).
    2. Rewrite the headers with ``self.header_rewriter``.
    3. If a chunked transfer-encoding header was removed by the rewrite,
       wrap ``stream`` in a ``ChunkedLineReader``.
    4. For the ``id_`` modifier, return the stream with the *original*
       ``status_headers`` (minus transfer-encoding when de-chunked).
    5. For non-text content, return the stream with the rewritten headers.
    6. Otherwise un-gzip if needed, settle on an encoding, pick a rewriter
       by text type and return either a buffered or a streaming response
       depending on ``self.buffer_response``.

    :param wbrequest: request object; ``urlrewriter`` and ``wb_url.mod``
        are read from it.
    :param cdx: index entry handed to the redirect checks and the head
        insert template.
    :param status_headers: status and headers of the archived record.
    :param stream: body stream of the archived record.
    :param static_path: passed through to the head insert template.
    :raises Exception: the rewritten headers report a text type other than
        ``html``, ``css``, ``js`` or ``xml``.
    """
    self._reject_self_redirect(wbrequest, cdx, status_headers)
    self._redirect_if_needed(wbrequest, cdx)

    urlrewriter = wbrequest.urlrewriter
    rewritten = self.header_rewriter.rewrite(status_headers, urlrewriter)

    # De-chunk here in case the chunk encoding is broken.
    # TODO: investigate further
    dechunked = rewritten.contains_removed_header('transfer-encoding',
                                                  'chunked')
    if dechunked:
        stream = archiveloader.ChunkedLineReader(stream)

    # Transparent replay: original headers, but the body may have been
    # de-chunked, in which case the header must go too.
    if wbrequest.wb_url.mod == 'id_':
        if dechunked:
            status_headers.remove_header('transfer-encoding')
        return self.create_stream_response(status_headers, stream)

    # Non-text content passes straight through with the rewritten headers.
    text_type = rewritten.text_type
    if text_type is None:
        return self.create_stream_response(rewritten.status_headers, stream)

    # ---- text rewriting from here on ----

    # The body has to be un-gzipped before it can be rewritten.
    if rewritten.contains_removed_header('content-encoding', 'gzip'):
        gunzip = utils.create_decompressor()
        stream = archiveloader.LineReader(stream, decomp = gunzip)

    # TODO: is this right?
    # Prefer the charset from the headers; only sniff the body without one.
    encoding = rewritten.charset
    first_buff = None
    if not encoding:
        (encoding, first_buff) = self._detect_charset(stream)

        # A detected 'ascii' is widened to utf-8.
        if encoding == 'ascii':
            encoding = 'utf-8'

    status_headers = rewritten.status_headers

    # NOTE: an earlier variant dispatched to separate html / non-html
    # rewrite helpers here (buffering html, streaming the rest); a single
    # rewriter-based path is used instead.
    if text_type == 'html':
        if self.head_insert_view:
            head_insert = self.head_insert_view.render_to_string(
                wbrequest = wbrequest,
                cdx = cdx,
                static_path = static_path)
        else:
            head_insert = None

        rewriter = html_rewriter.HTMLRewriter(urlrewriter,
                                              outstream = None,
                                              head_insert = head_insert)
    elif text_type == 'css':
        rewriter = regex_rewriters.CSSRewriter(urlrewriter)
    elif text_type == 'js':
        rewriter = regex_rewriters.JSRewriter(urlrewriter)
    elif text_type == 'xml':
        rewriter = regex_rewriters.XMLRewriter(urlrewriter)
    else:
        raise Exception('Unknown Text Type for Rewrite: ' + text_type)

    # Generator producing the rewritten body.
    body_gen = self._create_rewrite_stream(rewriter, encoding, stream,
                                           first_buff)

    if not self.buffer_response:
        return WbResponse(status_headers, value = body_gen)

    return self._create_buffer_response(status_headers, body_gen)