def process(record, out, options):
    """Optionally decode the HTTP payload of *record*, then write it to *out*.

    When ``options.decode_http`` is set and the record is a
    ``WarcRecord.RESPONSE``, the payload is fed through the HTTP message
    parser matching its content type.  If the parser consumes everything and
    reports a complete message, the record content is replaced by the decoded
    message; otherwise the record is left untouched and a diagnostic line is
    written to stderr.  The record is written to *out* in every case.

    Args:
        record: Record exposing ``type``, ``content`` (a
            ``(content_type, content)`` pair), ``id`` and ``write_to``.
        out: Destination passed straight through to ``record.write_to``.
        options: Options object providing ``wget_workaround``,
            ``decode_http`` and ``gzip``.

    Returns:
        None.
    """
    ignore_headers = WGET_IGNORE_HEADERS if options.wget_workaround else ()

    if options.decode_http and record.type == WarcRecord.RESPONSE:
        content_type, content = record.content

        # The request check comes first so that, exactly as with the former
        # pair of independent ``if`` statements, a request content type wins
        # should both constants ever compare equal.
        message = None
        if content_type == RequestMessage.CONTENT_TYPE:
            message = RequestMessage(ignore_headers=ignore_headers)
        elif content_type == ResponseMessage.CONTENT_TYPE:
            # Technically, an HTTP response needs to know its request to be
            # parsed, because responses to HEAD requests don't have a body.
            # We assume we don't store HEAD responses, and plough on with a
            # blank request.
            message = ResponseMessage(RequestMessage(),
                                      ignore_headers=ignore_headers)

        if message:
            leftover = message.feed(content)
            message.close()
            if not leftover and message.complete():
                record.content = content_type, message.get_decoded_message()
            else:
                error = []
                if leftover:
                    error.append("%d bytes unparsed" % len(leftover))
                if not message.complete():
                    error.append("incomplete message (at %s, %s)"
                                 % (message.mode, message.header.mode))
                # Explicit write instead of the Python-2-only ``print >>``
                # statement: same text, but it also runs on Python 3, where
                # the chevron form raises TypeError at runtime.
                sys.stderr.write("errors decoding http in record %s %s\n"
                                 % (record.id, ",".join(error)))

    record.write_to(out, gzip=options.gzip)
def process(record, out, options):
    """Write *record* to *out*, decoding its HTTP payload first if requested.

    Decoding only happens when ``options.decode_http`` is set and the record
    type is ``WarcRecord.RESPONSE``.  A payload whose content type matches
    ``ResponseMessage.CONTENT_TYPE`` or ``RequestMessage.CONTENT_TYPE`` is
    parsed; on a clean, complete parse the record content is swapped for the
    decoded message.  On a partial parse nothing is changed and the problems
    are reported on stderr.

    Args:
        record: Record exposing ``type``, ``content``, ``id`` and
            ``write_to``.
        out: Destination passed to ``record.write_to``.
        options: Options object providing ``wget_workaround``,
            ``decode_http`` and ``gzip``.

    Returns:
        None.
    """
    if options.wget_workaround:
        ignore_headers = WGET_IGNORE_HEADERS
    else:
        ignore_headers = ()

    if options.decode_http:
        if record.type == WarcRecord.RESPONSE:
            content_type, content = record.content
            message = None
            if content_type == ResponseMessage.CONTENT_TYPE:
                # Technically, an HTTP response needs to know its request to
                # be parsed, because responses to HEAD requests don't have a
                # body.  We assume we don't store HEAD responses, and plough
                # on.
                message = ResponseMessage(RequestMessage(),
                                          ignore_headers=ignore_headers)
            if content_type == RequestMessage.CONTENT_TYPE:
                message = RequestMessage(ignore_headers=ignore_headers)

            if message:
                leftover = message.feed(content)
                message.close()
                parsed_cleanly = not leftover and message.complete()
                if parsed_cleanly:
                    content = message.get_decoded_message()
                    record.content = content_type, content
                else:
                    problems = []
                    if leftover:
                        problems.append("%d bytes unparsed" % len(leftover))
                    if not message.complete():
                        problems.append(
                            "incomplete message (at %s, %s)"
                            % (message.mode, message.header.mode))
                    # ``print >> sys.stderr`` is Python 2 only (on Python 3
                    # it raises TypeError at runtime); this writes the same
                    # space-separated line on either interpreter.
                    sys.stderr.write(
                        "errors decoding http in record %s %s\n"
                        % (record.id, ",".join(problems)))

    record.write_to(out, gzip=options.gzip)
def process(record, previous_record, out, options, found_hrefs):
    """Decode, harvest hrefs from, and conditionally write out *record*.

    Two independent stages:

    1. If ``options.decode_http`` is set and the record is a
       ``WarcRecord.RESPONSE``, its payload is parsed as an HTTP message.
       On a clean, complete parse the record content is replaced by the
       decoded message and, for status 200 responses, every
       ``JSON_HREF_RE`` match found in the decoded body is added to
       *found_hrefs* (when that is not None).  A partial parse is reported
       on stderr and the record is left untouched.
    2. The record is written to *out*.  With ``options.strip_404s`` set,
       request records are never written directly: a response whose parsed
       status is 404 is dropped together with its request, any other
       response is written right after *previous_record*, and every other
       record type is written as is.

    Args:
        record: Record exposing ``type``, ``content``, ``id`` and
            ``write_to``.
        previous_record: The record seen just before *record*, or None.
            Only consulted when a response is written under
            ``strip_404s``.
        out: Destination passed to ``write_to``.
        options: Options object providing ``wget_workaround``,
            ``decode_http``, ``strip_404s`` and ``gzip``.
        found_hrefs: Object with an ``update`` method collecting hrefs, or
            None to skip href harvesting.

    Returns:
        None.

    Raises:
        RuntimeError: With ``strip_404s`` set, when a response must be
            written but *previous_record* is missing or is not a
            ``WarcRecord.REQUEST``.
    """
    ignore_headers = WGET_IGNORE_HEADERS if options.wget_workaround else ()

    # Bound up front because the 404 filter below reads it even when
    # decoding is switched off or the payload is not an HTTP message;
    # previously those cases crashed with UnboundLocalError / AttributeError.
    message = None

    if options.decode_http and record.type == WarcRecord.RESPONSE:
        content_type, content = record.content
        if content_type == ResponseMessage.CONTENT_TYPE:
            # Technically, an HTTP response needs to know its request to be
            # parsed, because responses to HEAD requests don't have a body.
            # We assume we don't store HEAD responses, and plough on.
            message = ResponseMessage(RequestMessage(),
                                      ignore_headers=ignore_headers)
        if content_type == RequestMessage.CONTENT_TYPE:
            message = RequestMessage(ignore_headers=ignore_headers)

        if message:
            leftover = message.feed(content)
            message.close()
            if not leftover and message.complete():
                content = message.get_decoded_message()
                if found_hrefs is not None and message.header.code == 200:
                    # NOTE(review): the [12:-2] slice presumably trims the
                    # fixed prefix/suffix that JSON_HREF_RE matches around
                    # the href -- confirm against the pattern.
                    found_hrefs.update(
                        match[12:-2]
                        for match in JSON_HREF_RE.findall(content))
                record.content = content_type, content
            else:
                error = []
                if leftover:
                    error.append("%d bytes unparsed" % len(leftover))
                if not message.complete():
                    error.append("incomplete message (at %s, %s)"
                                 % (message.mode, message.header.mode))
                # Explicit write instead of the Python-2-only ``print >>``
                # statement; emits the same line on Python 2 and 3.
                sys.stderr.write("errors decoding http in record %s %s\n"
                                 % (record.id, ",".join(error)))

    if not options.strip_404s:
        record.write_to(out, gzip=options.gzip)
        return

    if record.type == WarcRecord.REQUEST:
        # We don't write out a request until we confirm its associated
        # response is not 404; it is written below, as previous_record,
        # when that response is processed.
        return

    if record.type != WarcRecord.RESPONSE:
        # metadata
        record.write_to(out, gzip=options.gzip)
        return

    # A response that was never parsed has no known status; keep it rather
    # than risk discarding data that is not a confirmed 404.
    status = message.header.code if message is not None else None
    if status == 404:
        # If 404, don't write out either the request or the response.
        return

    if previous_record is None:
        raise RuntimeError("Need to write out previous record as well, but it isn't present")
    if previous_record.type != WarcRecord.REQUEST:
        raise RuntimeError("Expected previous record to be a "
                           "WarcRecord.REQUEST, was a %r" % (previous_record.type,))
    # Note that if a request is made multiple times, we will only write out
    # the last attempt at it.
    previous_record.write_to(out, gzip=options.gzip)
    record.write_to(out, gzip=options.gzip)
def process(record, previous_record, out, options, found_hrefs):
    """Process one record: optional HTTP decode, then filtered write-out.

    Decoding (``options.decode_http`` set, record type
    ``WarcRecord.RESPONSE``): the payload is parsed with the HTTP message
    class matching its content type.  If it parses completely, the record
    content becomes the decoded message, and for a status 200 message the
    ``JSON_HREF_RE`` matches in the decoded body are added to *found_hrefs*
    unless that is None.  Otherwise the parse problems are written to
    stderr and the record keeps its original content.

    Writing: without ``options.strip_404s`` the record is always written.
    With it, requests are held back, a response with parsed status 404 is
    dropped along with its request, any other response is written after
    *previous_record*, and all remaining record types are written directly.

    Args:
        record: Record exposing ``type``, ``content``, ``id`` and
            ``write_to``.
        previous_record: Record processed immediately before *record*, or
            None.
        out: Destination passed to ``write_to``.
        options: Options object providing ``wget_workaround``,
            ``decode_http``, ``strip_404s`` and ``gzip``.
        found_hrefs: Collector with an ``update`` method, or None.

    Returns:
        None.

    Raises:
        RuntimeError: Under ``strip_404s``, if a non-404 response arrives
            and *previous_record* is None or not a ``WarcRecord.REQUEST``.
    """
    ignore_headers = WGET_IGNORE_HEADERS if options.wget_workaround else ()

    # Must exist even when the decode stage is skipped: the strip_404s stage
    # looks at it for every response.  Leaving it unbound used to raise
    # UnboundLocalError when strip_404s was used without decode_http.
    message = None

    if options.decode_http:
        if record.type == WarcRecord.RESPONSE:
            content_type, content = record.content
            if content_type == ResponseMessage.CONTENT_TYPE:
                # Technically, an HTTP response needs to know its request to
                # be parsed, because responses to HEAD requests don't have a
                # body.  We assume we don't store HEAD responses, and plough
                # on.
                message = ResponseMessage(RequestMessage(),
                                          ignore_headers=ignore_headers)
            if content_type == RequestMessage.CONTENT_TYPE:
                message = RequestMessage(ignore_headers=ignore_headers)

            if message:
                leftover = message.feed(content)
                message.close()
                if not leftover and message.complete():
                    content = message.get_decoded_message()
                    if found_hrefs is not None and message.header.code == 200:
                        # NOTE(review): [12:-2] presumably strips the fixed
                        # text JSON_HREF_RE matches around each href --
                        # verify against the pattern definition.
                        for match in JSON_HREF_RE.findall(content):
                            found_hrefs.update([match[12:-2]])
                    record.content = content_type, content
                else:
                    problems = []
                    if leftover:
                        problems.append("%d bytes unparsed" % len(leftover))
                    if not message.complete():
                        problems.append(
                            "incomplete message (at %s, %s)"
                            % (message.mode, message.header.mode))
                    # Replaces the Python-2-only ``print >> sys.stderr``
                    # statement with an equivalent, portable write.
                    sys.stderr.write(
                        "errors decoding http in record %s %s\n"
                        % (record.id, ",".join(problems)))

    if not options.strip_404s:
        record.write_to(out, gzip=options.gzip)
    elif record.type == WarcRecord.REQUEST:
        # We don't write out a request until we confirm its associated
        # response is not 404.
        pass
    elif record.type == WarcRecord.RESPONSE:
        # No parsed message means the status is unknown; such a response is
        # kept, since only a confirmed 404 should be stripped.
        status = None if message is None else message.header.code
        if status != 404:
            if previous_record is None:
                raise RuntimeError(
                    "Need to write out previous record as well, but it isn't present"
                )
            if previous_record.type != WarcRecord.REQUEST:
                raise RuntimeError("Expected previous record to be a "
                                   "WarcRecord.REQUEST, was a %r" %
                                   (previous_record.type, ))
            # Note that if a request is made multiple times, we will only
            # write out the last attempt at it.
            previous_record.write_to(out, gzip=options.gzip)
            record.write_to(out, gzip=options.gzip)
        # else: 404 -- write out neither the request nor the response.
    else:
        # metadata
        record.write_to(out, gzip=options.gzip)