def writeRecordToTransport(r, t):
    """Replay the HTTP response stored in record ``r`` onto transport ``t``.

    ``r.content[1]`` is parsed as an HTTP response.  The status line, a
    rewritten header set and the parsed body are then written to ``t`` with
    four ``t.write`` calls (status line, headers, blank line, body).

    Header rewriting:
      * connection/caching/length related headers are left out of the
        replayed set,
      * ``Content-Length`` (length of the parsed body) and
        ``Connection: keep-alive`` are added,
      * every original header is also emitted with an ``X-Archive-Orig-``
        prefix.
    """
    parsed = ResponseMessage(RequestMessage())
    parsed.feed(r.content[1])
    parsed.close()
    payload = parsed.get_body()

    # Headers that must not be forwarded verbatim.
    dropped = ("connection", "content-length", "cache-control",
               "accept-ranges", "etag", "last-modified",
               "transfer-encoding")

    original = list(parsed.header.headers)
    replayed = [(key, value) for key, value in original
                if key.lower() not in dropped]
    replayed.append(("Content-Length", "%d" % len(payload)))
    replayed.append(("Connection", "keep-alive"))
    archived = [("X-Archive-Orig-%s" % key, value) for key, value in original]

    status = parsed.header
    t.write("%s %d %s\r\n" % (status.version, status.code, status.phrase))
    header_lines = ["%s: %s" % pair for pair in replayed + archived]
    t.write("\r\n".join(header_lines))
    t.write("\r\n\r\n")
    t.write(payload)
def parse_http_response_charset(record): """Parses the payload of an HTTP 'response' record, returning code, content type, declared character set and body. Adapted from github's internetarchive/warctools hanzo/warcfilter.py, commit 1850f328e31e505569126b4739cec62ffa444223. MIT licenced.""" message = ResponseMessage(RequestMessage()) remainder = message.feed(record.content[1]) message.close() if remainder or not message.complete(): if remainder: print 'trailing data in http response for', record.url if not message.complete(): print 'truncated http response for', record.url header = message.header mime_type = [v for k,v in header.headers if k.lower() == b'content-type'] charset = None if mime_type: match = re.search(r'charset=(\S+)', mime_type[0], re.I) if match: charset = match.group(1).lower() mime_type = mime_type[0].split(b';')[0] else: mime_type = None return header.code, mime_type, charset, message.get_body()
def iter_zip(self):
    """Generate the bytes of a zip archive built from the WARC's HTTP responses.

    A ``ZipFile`` is opened in write mode on this object itself.  For every
    HTTP response record read from ``self.archive`` the parsed body is added
    as a member (named by ``self.url_to_filename`` and dated from the
    record's date), and whatever is in ``self.buffer`` is yielded and the
    buffer reset.  Reader errors are collected in ``self.errors``.

    At the end a ``files.txt`` index ("url -> filename" per line) and, if
    any errors were collected, an ``errors.txt`` member are added; the
    remaining buffered chunks are yielded once the zip has been closed.
    """
    with ZipFile(self, "w") as archive_zip:
        for offset, record, errors in self.archive.read_records(limit=None):
            is_http_response = (
                record
                and record.type == WarcRecord.RESPONSE
                and re.sub(r'\s+', '', record.content[0]) == ResponseMessage.CONTENT_TYPE
            )
            if is_http_response:
                response = ResponseMessage(RequestMessage())
                response.feed(record.content[1])
                response.close()

                member_name = self.url_to_filename(record.url)
                # Slice the fixed-width "YYYY-MM-DDThh:mm:ss" prefix of the
                # record date into the 6-tuple ZipInfo expects.
                stamp = record.date
                fields = ((0, 4), (5, 7), (8, 10), (11, 13), (14, 16), (17, 19))
                member_time = tuple(int(stamp[lo:hi]) for lo, hi in fields)

                archive_zip.writestr(ZipInfo(member_name, member_time),
                                     response.get_body())
                self.files[member_name] = record.url

                # Hand over what has been buffered so far.
                for chunk in self.buffer:
                    yield chunk
                self.buffer = []
            elif errors:
                # NOTE(review): `name` is not defined in this method; it must
                # exist as a module-level name or this line raises NameError.
                self.errors.append(
                    "warc errors at %s:%d" % (name, offset if offset else 0))
                for problem in errors:
                    self.errors.append(problem)

        index_lines = ["%s -> %s" % (url, member)
                       for member, url in self.files.iteritems()]
        archive_zip.writestr("files.txt", "\n".join(index_lines))
        if self.errors:
            archive_zip.writestr("errors.txt", "\n".join(self.errors))

    # Closing the ZipFile writes its trailing records; flush those too.
    for chunk in self.buffer:
        yield chunk
    self.buffer = []
def __call__(self, request): """Called by HTTPServer to execute the request.""" web_match = re.match(self.WEB_RE, request.uri) if not web_match: web_match = re.match(self.WEB_VIA_PROXY_RE, request.uri) if web_match: request.host = "warc" request.uri = web_match.group("uri") request.path, sep, query = request.uri.partition("?") self.web_handler.__call__(request) else: with self.proxy_handler.warc_record_for_uri( canonicalize_url(request.uri)) as record: if record: print "Serving %s from WARC" % request.uri # parse the response message = ResponseMessage(RequestMessage()) message.feed(record[1].content[1]) message.close() body = message.get_body() # construct new headers new_headers = [] old_headers = [] for k, v in message.header.headers: if not k.lower() in ("connection", "content-length", "cache-control", "accept-ranges", "etag", "last-modified", "transfer-encoding"): new_headers.append((k, v)) old_headers.append(("X-Archive-Orig-%s" % k, v)) new_headers.append(("Content-Length", "%d" % len(body))) new_headers.append(("Connection", "keep-alive")) # write the response request.write("%s %d %s\r\n" % (message.header.version, message.header.code, message.header.phrase)) request.write("\r\n".join([ "%s: %s" % (k, v) for k, v in (new_headers + old_headers) ])) request.write("\r\n\r\n") request.write(body) else: print "Could not find %s in WARC" % request.uri request.write( "HTTP/1.0 404 Not Found\r\nConnection: keep-alive\r\nContent-Type: text/plain\r\nContent-Length: 91\r\n\r\nThis URL is not in any of your archives. Close the WARC viewer to resume normal browsing.\r\n" ) request.finish()
def dump_record(fh, outzip):
    """Copy the body of every HTTP response record in ``fh`` into ``outzip``.

    ``fh`` must provide ``read_records(limit=None)`` yielding
    ``(offset, record, errors)`` tuples.  For each response record whose
    content type equals ``ResponseMessage.CONTENT_TYPE`` the parsed body is
    stored with ``outzip.writestr`` under the record URL minus its
    ``http://`` / ``https://`` prefix, and the URL is printed to stdout.

    Reader errors are reported on stderr (header line and one
    tab-indented line per error) so they do not mix with the URL listing
    on stdout.
    """
    for (offset, record, errors) in fh.read_records(limit=None):
        if (record and record.type == WarcRecord.RESPONSE
                and record.content[0] == ResponseMessage.CONTENT_TYPE):
            message = ResponseMessage(RequestMessage())
            message.feed(record.content[1])
            message.close()
            outzip.writestr(re.sub(r'^https?://', '', record.url),
                            message.get_body())
            print(record.url)
        elif errors:
            # NOTE(review): `name` is not a parameter or local here; it must
            # exist as a module-level name or this line raises NameError.
            print >> sys.stderr, "warc errors at %s:%d" % (
                name, offset if offset else 0)
            for e in errors:
                # Details go to the same stream as the header line.
                print >> sys.stderr, '\t', e
def extractPayload(record):
    """
    Return the HTTP response body stored in ``record``.

    The body is run through a gzip decompressor; if zlib rejects the data
    the body is returned exactly as parsed.

    :type record: WarcRecord
    """
    response = ResponseMessage(RequestMessage())
    response.feed(record.content[1])
    response.close()
    raw_body = response.get_body()

    # wbits = 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    gunzip = zlib.decompressobj(16 + zlib.MAX_WBITS)
    try:
        return gunzip.decompress(raw_body)
    except zlib.error:
        return raw_body
def dump_record(fh, outzip):
    """Write the bodies of all HTTP response records from ``fh`` to ``outzip``.

    Iterates ``fh.read_records(limit=None)``, which yields
    ``(offset, record, errors)`` tuples.  A response record whose content
    type equals ``ResponseMessage.CONTENT_TYPE`` is parsed and its body is
    added to ``outzip`` via ``writestr``; the member name is the record URL
    with a leading ``http://`` or ``https://`` removed.  Each stored URL is
    printed to stdout.

    When the reader reports errors instead of a record, a header line and
    one tab-indented line per error are written to stderr, keeping stdout
    limited to the URL listing.
    """
    for (offset, record, errors) in fh.read_records(limit=None):
        is_http_response = (
            record
            and record.type == WarcRecord.RESPONSE
            and record.content[0] == ResponseMessage.CONTENT_TYPE
        )
        if is_http_response:
            message = ResponseMessage(RequestMessage())
            message.feed(record.content[1])
            message.close()
            member_name = re.sub(r'^https?://', '', record.url)
            outzip.writestr(member_name, message.get_body())
            print(record.url)
        elif errors:
            # NOTE(review): `name` is not defined in this function; it must
            # be available at module level or this line raises NameError.
            print >> sys.stderr, "warc errors at %s:%d" % (
                name, offset if offset else 0)
            for e in errors:
                # Keep error details on stderr alongside their header line.
                print >> sys.stderr, '\t', e
def __call__(self, request): """Called by HTTPServer to execute the request.""" web_match = re.match(self.WEB_RE, request.uri) if not web_match: web_match = re.match(self.WEB_VIA_PROXY_RE, request.uri) if web_match: request.host = "warc" request.uri = web_match.group("uri") request.path, sep, query = request.uri.partition("?") self.web_handler.__call__(request) else: with self.proxy_handler.warc_record_for_uri(canonicalize_url(request.uri)) as record: if record: print "Serving %s from WARC" % request.uri # parse the response message = ResponseMessage(RequestMessage()) message.feed(record[1].content[1]) message.close() body = message.get_body() # construct new headers new_headers = [] old_headers = [] for k, v in message.header.headers: if not k.lower() in ("connection", "content-length", "cache-control", "accept-ranges", "etag", "last-modified", "transfer-encoding"): new_headers.append((k, v)) old_headers.append(("X-Archive-Orig-%s" % k, v)) new_headers.append(("Content-Length", "%d" % len(body))) new_headers.append(("Connection", "keep-alive")) # write the response request.write("%s %d %s\r\n" % (message.header.version, message.header.code, message.header.phrase)) request.write("\r\n".join([ "%s: %s" % (k,v) for k,v in (new_headers + old_headers) ])) request.write("\r\n\r\n") request.write(body) else: print "Could not find %s in WARC" % request.uri request.write("HTTP/1.0 404 Not Found\r\nConnection: keep-alive\r\nContent-Type: text/plain\r\nContent-Length: 91\r\n\r\nThis URL is not in any of your archives. Close the WARC viewer to resume normal browsing.\r\n") request.finish()
def iter_zip(self):
    """Yield a zip archive, chunk by chunk, holding the WARC's HTTP responses.

    The zip is written to this object (it is passed to ``ZipFile`` as the
    output file).  Each HTTP response record from ``self.archive`` becomes
    one member: name from ``self.url_to_filename(record.url)``, timestamp
    from ``record.date``, content from the parsed response body.  After each
    member the contents of ``self.buffer`` are yielded and the buffer is
    emptied.  Errors reported by the reader are appended to ``self.errors``.

    Finally ``files.txt`` (one "url -> filename" line per member) and, when
    errors occurred, ``errors.txt`` are added, and the buffer is drained one
    last time after the zip has been closed.
    """
    with ZipFile(self, "w") as zip_out:
        for offset, record, errors in self.archive.read_records(limit=None):
            if not (record
                    and record.type == WarcRecord.RESPONSE
                    and re.sub(r'\s+', '', record.content[0])
                    == ResponseMessage.CONTENT_TYPE):
                if errors:
                    # NOTE(review): `name` is not defined in this method; it
                    # must be a module-level name or this raises NameError.
                    self.errors.append(
                        "warc errors at %s:%d"
                        % (name, offset if offset else 0))
                    for err in errors:
                        self.errors.append(err)
                continue

            http_message = ResponseMessage(RequestMessage())
            http_message.feed(record.content[1])
            http_message.close()

            filename = self.url_to_filename(record.url)
            when = record.date
            # (year, month, day, hour, minute, second) cut from the
            # fixed-position fields of the record date string.
            zip_time = (
                int(when[0:4]),
                int(when[5:7]),
                int(when[8:10]),
                int(when[11:13]),
                int(when[14:16]),
                int(when[17:19]),
            )
            zip_out.writestr(ZipInfo(filename, zip_time),
                             http_message.get_body())
            self.files[filename] = record.url

            for piece in self.buffer:
                yield piece
            self.buffer = []

        listing = "\n".join(
            ["%s -> %s" % (url, fname)
             for fname, url in self.files.iteritems()])
        zip_out.writestr("files.txt", listing)
        if len(self.errors) > 0:
            zip_out.writestr("errors.txt", "\n".join(self.errors))

    # Leaving the `with` block closes the zip, which may buffer more output.
    for piece in self.buffer:
        yield piece
    self.buffer = []
def parse_http_response_charset(record): """Parses the payload of an HTTP 'response' record, returning code, content type and body. Adapted from github's internetarchive/warctools hanzo/warcfilter.py, commit 1850f328e31e505569126b4739cec62ffa444223. MIT licenced.""" message = ResponseMessage(RequestMessage()) remainder = message.feed(record.content[1]) message.close() if remainder or not message.complete(): if remainder: raise Exception('trailing data in http response for'+str(record.url)) if not message.complete(): print Exception('truncated http response for'+str(record.url)) header = message.header mime_type = [v for k,v in header.headers if k.lower() == b'content-type'] if mime_type: mime_type, charset = mime_type[0].split(b';') else: mime_type = None return header.code, mime_type, message.get_body()
def parse_http_response(record): """Parses the payload of an HTTP 'response' record, returning code, content type and body. Adapted from github's internetarchive/warctools hanzo/warcfilter.py, commit 1850f328e31e505569126b4739cec62ffa444223. MIT licenced.""" message = ResponseMessage(RequestMessage()) remainder = message.feed(record.content[1]) message.close() if remainder or not message.complete(): if remainder: print 'trailing data in http response for', record.url if not message.complete(): print 'truncated http response for', record.url header = message.header mime_type = [v for k, v in header.headers if k.lower() == b'content-type'] if mime_type: mime_type = mime_type[0].split(b';')[0] else: mime_type = None return header.code, mime_type, message.get_body()
def _init_from_warc_record(self, warc_record):
    """Populate this reply from the HTTP response stored in ``warc_record``.

    Opens the reply read-only/unbuffered, sets its URL from the record,
    parses ``warc_record.content[1]`` as an HTTP response and copies the
    headers, status code and reason phrase onto the reply.  The parsed body
    is stored in ``self._data``.  ``metaDataChanged``, ``readyRead`` and
    ``finished`` are emitted through zero-delay single-shot timers rather
    than directly from this method.
    """
    self._warc_record = warc_record
    self.open(QNetworkReply.ReadOnly | QNetworkReply.Unbuffered)
    self.setUrl(QUrl(str_to_qstring(self._warc_record.url)))

    rs = ResponseMessage(RequestMessage())
    rs.feed(self._warc_record.content[1])
    # Tell the parser the input has ended before reading header and body.
    rs.close()

    for name, value in rs.header.headers:
        self.setRawHeader(name, value)
    self.setAttribute(QNetworkRequest.HttpStatusCodeAttribute,
                      rs.header.code)
    self.setAttribute(QNetworkRequest.HttpReasonPhraseAttribute,
                      rs.header.phrase)
    self._check_for_redirect(rs.header.code)

    QTimer.singleShot(0, lambda: self.metaDataChanged.emit())
    self._data = rs.get_body()
    QTimer.singleShot(0, lambda: self.readyRead.emit())
    QTimer.singleShot(0, lambda: self.finished.emit())