def test_to_str_2():
    """Parsing a request header block and stringifying it must be lossless."""
    parsed = StatusAndHeadersParser(['GET']).parse(StringIO(req_headers))
    assert str(parsed) == req_headers

    # A trailing blank line is consumed by the parser and not re-emitted.
    parsed = StatusAndHeadersParser(['GET']).parse(StringIO(req_headers + '\r\n'))
    assert str(parsed) == req_headers
def test_to_str_with_remove():
    """remove_header('Foo') drops that header from the stringified output."""
    parsed = StatusAndHeadersParser(['GET']).parse(StringIO(req_headers))
    parsed.remove_header('Foo')
    expected = "GET / HTTP/1.0\r\nContent-Length: 0\r\n"
    assert str(parsed) == expected
def test_to_str_with_remove():
    """After removing 'Foo', only the statusline and remaining header survive."""
    result = StatusAndHeadersParser(['GET']).parse(StringIO(req_headers))
    result.remove_header('Foo')
    assert str(result) == "GET / HTTP/1.0\r\n" + "Content-Length: 0\r\n"
def test_to_str_exclude():
    """to_str/to_bytes with an exclude list omit the excluded header."""
    sah = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
    result = sah.to_str(['multi-line'])
    expected = ("HTTP/1.0 200 OK\r\n"
                "Content-Type: ABC\r\n"
                "Some: Value\r\n")
    assert result == expected
    # to_bytes additionally appends the blank line terminating the block.
    assert sah.to_bytes(['multi-line']) == expected.encode('latin-1') + b'\r\n'
def __init__(self, loader=None, cookie_maker=None, block_size=8192):
    """Set up loader and one parser per record flavor (ARC, WARC, HTTP)."""
    # Fall back to a default BlockLoader when none is supplied.
    self.loader = loader or BlockLoader(cookie_maker)
    self.block_size = block_size

    self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
    self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
    self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES)
    self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS)
def test_to_str_exclude():
    """Excluded header names are omitted by both to_str and to_bytes."""
    parser = StatusAndHeadersParser(['HTTP/1.0'])
    sah = parser.parse(StringIO(status_headers_1))

    expected = "HTTP/1.0 200 OK\r\nContent-Type: ABC\r\nSome: Value\r\n"
    assert sah.to_str(['multi-line']) == expected
    assert sah.to_bytes(['multi-line']) == (expected.encode('latin-1') + b'\r\n')
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
             verify_http=True, arc2warc=True):
    """Set up loader and parsers; arc2warc selects ARC->WARC header mapping."""
    self.loader = loader or BlockLoader(cookie_maker=cookie_maker)
    self.block_size = block_size

    # Choose which ARC header parser to use based on the arc2warc flag.
    self.arc_parser = ARC2WARCHeadersParser() if arc2warc else ARCHeadersParser()

    self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
    self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
    self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
def __init__(self, gzip=True, dedup_index=None, name='recorder',
             header_filter=ExcludeNone(), *args, **kwargs):
    """Store writer configuration.

    NOTE(review): the ExcludeNone() default is created once at definition
    time and shared across instances -- confirm it is stateless.
    """
    self.gzip = gzip
    self.dedup_index = dedup_index
    self.rec_source_name = name
    self.header_filter = header_filter

    # Captured once at construction time.
    self.hostname = gethostname()

    # Lenient parser: no required statusline tokens, verification off.
    self.parser = StatusAndHeadersParser([], verify=False)
def test_to_str_1():
    """Full round-trip: parsed response headers stringify to the original form."""
    parsed = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
    expected = ("HTTP/1.0 200 OK\r\n"
                "Content-Type: ABC\r\n"
                "Some: Value\r\n"
                "Multi-Line: Value1 Also This\r\n")
    assert str(parsed) == expected
def __init__(self, paths, cdx_source):
    """Build path resolvers and a resolving loader for WARC lookup."""
    # Normalize a single path string into a one-element list.
    self.paths = [paths] if isinstance(paths, six.string_types) else paths
    self.resolvers = [self._make_resolver(p) for p in self.paths]

    self.resolve_loader = ResolvingLoader(self.resolvers, no_record_parse=True)
    self.headers_parser = StatusAndHeadersParser([], verify=False)
    self.cdx_source = cdx_source
def _get_protocol_and_headers(self, headerline, parts):
    """Map parsed ARC header parts onto WARC-style ('WARC/1.0', headers)."""
    headers = []
    for name, value in zip(self.headernames, parts):
        # timestamp_to_iso_date converts the archive timestamp for WARC-Date.
        if name == 'WARC-Date':
            value = timestamp_to_iso_date(value)
        headers.append((name, value))

    # filedesc:// marks the leading ARC file-header record.
    rec_type = 'arc_header' if headerline.startswith('filedesc://') else 'response'

    headers.append(('WARC-Type', rec_type))
    headers.append(('WARC-Record-ID', StatusAndHeadersParser.make_warc_id()))
    return ('WARC/1.0', headers)
def _get_protocol_and_headers(self, headerline, parts):
    """Translate ARC header fields into WARC headers for a synthesized record."""
    result = []
    for header_name, header_value in zip(self.headernames, parts):
        if header_name == 'WARC-Date':
            header_value = timestamp_to_iso_date(header_value)
        result.append((header_name, header_value))

    if headerline.startswith('filedesc://'):
        record_kind = 'arc_header'
    else:
        record_kind = 'response'

    result.extend([('WARC-Type', record_kind),
                   ('WARC-Record-ID', StatusAndHeadersParser.make_warc_id())])
    return ('WARC/1.0', result)
def test_record_param_user_coll_revisit(self):
    """Record the same URL twice with user/coll params; second write is a revisit.

    Fix: the extra query-string literal had been corrupted by HTML-entity
    decoding ('&para' collapsed to the pilcrow character U+00B6); restored
    to '&param.recorder.user=...&param.recorder.coll=...'.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
    dedup_index = self._get_dedup_index()

    recorder_app = RecorderApp(self.upstream_url,
                               PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

    self._test_all_warcs('/warcs/', 2)

    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?foo=bar',
                                 '&param.recorder.user=USER&param.recorder.coll=COLL')
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # Test Redis CDX
    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    cdx = CDXObject(res[1])
    assert cdx['urlkey'] == 'org,httpbin)/get?foo=bar'
    assert cdx['mime'] == 'warc/revisit'
    assert cdx['offset'] == '0'
    assert cdx['filename'].startswith('USER/COLL/')
    assert cdx['filename'].endswith('.warc.gz')

    fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

    warcs = r.hgetall('USER:COLL:warc')
    assert len(warcs) == 2
    assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode('utf-8')

    with open(fullwarc, 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)

        # Test refers-to headers on the revisit record
        status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
        assert status_headers.get_header('WARC-Type') == 'revisit'
        assert status_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get?foo=bar'
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/get?foo=bar'
        assert status_headers.get_header('WARC-Refers-To-Date') != ''
class BaseWARCWriter(object):
    """Base class for WARC writers: builds records, computes digests,
    applies dedup/revisit logic, and serializes records to a stream."""

    # Default Content-Type per WARC record type, used when neither the
    # record headers nor the record itself specify one.
    WARC_RECORDS = {'warcinfo': 'application/warc-fields',
                    'response': 'application/http; msgtype=response',
                    'revisit': 'application/http; msgtype=response',
                    'request': 'application/http; msgtype=request',
                    'metadata': 'application/warc-fields',
                    }

    # WARC-Profile value written on revisit records.
    REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'

    # Chunk size used when streaming payloads for digest computation.
    BUFF_SIZE = 8192

    FILE_TEMPLATE = 'rec-{timestamp}-{hostname}.warc.gz'

    def __init__(self, gzip=True, dedup_index=None, name='recorder',
                 header_filter=ExcludeNone(), *args, **kwargs):
        """Store configuration; parser is lenient (no verification)."""
        self.gzip = gzip
        self.dedup_index = dedup_index
        self.rec_source_name = name
        self.header_filter = header_filter
        self.hostname = gethostname()
        self.parser = StatusAndHeadersParser([], verify=False)

    def ensure_digest(self, record):
        """Compute and set WARC-Block-Digest / WARC-Payload-Digest if missing.

        Reads the record stream in BUFF_SIZE chunks and restores the stream
        position afterwards. The block digest also covers the buffered HTTP
        headers, when present; the payload digest covers only the stream.
        """
        block_digest = record.rec_headers.get('WARC-Block-Digest')
        payload_digest = record.rec_headers.get('WARC-Payload-Digest')
        # Both digests already present: nothing to do.
        if block_digest and payload_digest:
            return

        block_digester = self._create_digester()
        payload_digester = self._create_digester()

        # Remember position so the stream can be rewound after digesting.
        pos = record.stream.tell()

        if record.status_headers and hasattr(record.status_headers, 'headers_buff'):
            block_digester.update(record.status_headers.headers_buff)

        while True:
            buf = record.stream.read(self.BUFF_SIZE)
            if not buf:
                break
            block_digester.update(buf)
            payload_digester.update(buf)

        record.stream.seek(pos)
        record.rec_headers['WARC-Block-Digest'] = str(block_digester)
        record.rec_headers['WARC-Payload-Digest'] = str(payload_digester)

    def _create_digester(self):
        # sha1 is the digest algorithm used for both block and payload.
        return Digester('sha1')

    def _set_header_buff(self, record):
        """Serialize status headers (minus filtered names) onto the record."""
        exclude_list = self.header_filter(record)
        buff = record.status_headers.to_bytes(exclude_list)
        record.status_headers.headers_buff = buff

    def write_req_resp(self, req, resp, params):
        """Write a request/response pair, applying revisit dedup to the response.

        Copies target URI, date and record-id linkage from the response onto
        the request, then delegates the actual writing to _do_write_req_resp.
        """
        url = resp.rec_headers.get('WARC-Target-URI')
        dt = resp.rec_headers.get('WARC-Date')

        #req.rec_headers['Content-Type'] = req.content_type
        req.rec_headers['WARC-Target-URI'] = url
        req.rec_headers['WARC-Date'] = dt

        resp_id = resp.rec_headers.get('WARC-Record-ID')
        if resp_id:
            req.rec_headers['WARC-Concurrent-To'] = resp_id

        # May turn the response into a revisit record, or None to skip.
        resp = self._check_revisit(resp, params)
        if not resp:
            print('Skipping due to dedup')
            return

        params['_formatter'] = ParamFormatter(params, name=self.rec_source_name)
        self._do_write_req_resp(req, resp, params)

    def create_req_record(self, req_headers, payload):
        """Build a 'request' ArcWarcRecord from headers and a seekable payload."""
        len_ = payload.tell()
        payload.seek(0)

        warc_headers = req_headers
        warc_headers['WARC-Type'] = 'request'
        if not warc_headers.get('WARC-Record-ID'):
            warc_headers['WARC-Record-ID'] = self._make_warc_id()

        status_headers = self.parser.parse(payload)

        record = ArcWarcRecord('warc', 'request', warc_headers, payload,
                               status_headers, '', len_)

        self._set_header_buff(record)
        return record

    def read_resp_record(self, resp_headers, payload):
        """Parse WARC headers (and HTTP headers for responses) from payload.

        Returns (record_type, record); ensures digests are present.
        """
        len_ = payload.tell()
        payload.seek(0)

        warc_headers = self.parser.parse(payload)
        warc_headers = CaseInsensitiveDict(warc_headers.headers)

        record_type = warc_headers.get('WARC-Type', 'response')

        # Only response records carry parseable HTTP status headers here.
        if record_type == 'response':
            status_headers = self.parser.parse(payload)
        else:
            status_headers = None

        record = ArcWarcRecord('warc', record_type, warc_headers, payload,
                               status_headers, '', len_)

        if record_type == 'response':
            self._set_header_buff(record)

        self.ensure_digest(record)
        return record_type, record

    def create_warcinfo_record(self, filename, info):
        """Build a 'warcinfo' record whose body is the serialized info dict."""
        warc_headers = {}
        warc_headers['WARC-Record-ID'] = self._make_warc_id()
        warc_headers['WARC-Type'] = 'warcinfo'
        if filename:
            warc_headers['WARC-Filename'] = filename
        warc_headers['WARC-Date'] = datetime_to_iso_date(datetime.datetime.utcnow())

        warcinfo = BytesIO()
        for n, v in six.iteritems(info):
            self._header(warcinfo, n, v)
        warcinfo.seek(0)

        record = ArcWarcRecord('warc', 'warcinfo', warc_headers, warcinfo,
                               None, '', len(warcinfo.getvalue()))
        return record

    def create_custom_record(self, uri, payload, record_type, content_type,
                             warc_headers=None):
        """Build a record of arbitrary type for uri with the given payload."""
        len_ = payload.tell()
        payload.seek(0)

        warc_headers = warc_headers or {}
        warc_headers['WARC-Record-ID'] = self._make_warc_id()
        warc_headers['WARC-Type'] = record_type
        warc_headers['WARC-Target-URI'] = uri
        if 'WARC-Date' not in warc_headers:
            warc_headers['WARC-Date'] = datetime_to_iso_date(datetime.datetime.utcnow())

        record = ArcWarcRecord('warc', record_type, warc_headers, payload,
                               None, content_type, len_)

        self.ensure_digest(record)
        return record

    def _check_revisit(self, record, params):
        """Consult the dedup index; may convert record to a revisit or skip it.

        Returns the (possibly modified) record, or None when the index says
        'skip'. Lookup errors are printed and treated as no-dedup.
        """
        if not self.dedup_index:
            return record

        try:
            url = record.rec_headers.get('WARC-Target-URI')
            digest = record.rec_headers.get('WARC-Payload-Digest')
            iso_dt = record.rec_headers.get('WARC-Date')
            result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt)
        except Exception as e:
            traceback.print_exc()
            result = None

        if result == 'skip':
            return None

        if isinstance(result, tuple) and result[0] == 'revisit':
            record.rec_headers['WARC-Type'] = 'revisit'
            record.rec_headers['WARC-Profile'] = self.REVISIT_PROFILE
            record.rec_headers['WARC-Refers-To-Target-URI'] = result[1]
            record.rec_headers['WARC-Refers-To-Date'] = result[2]

        return record

    def _write_warc_record(self, out, record):
        """Serialize one record (headers + body) to out, gzipping if enabled.

        For revisit records only the HTTP headers buffer is written, not the
        payload stream. Content-Length is recomputed from the actual header
        buffer size and the remaining stream length.
        """
        if self.gzip:
            out = GzippingWrapper(out)

        self._line(out, b'WARC/1.0')

        # Content-Length and Content-Type are recomputed below, so skip any
        # values carried in rec_headers.
        for n, v in six.iteritems(record.rec_headers):
            if n.lower() in ('content-length', 'content-type'):
                continue
            self._header(out, n, v)

        content_type = record.rec_headers.get('Content-Type')
        if not content_type:
            content_type = record.content_type
            if not content_type:
                content_type = self.WARC_RECORDS.get(record.rec_headers['WARC-Type'])

        if content_type:
            self._header(out, 'Content-Type', content_type)

        # Revisits carry only the HTTP headers, never the payload body.
        if record.rec_headers['WARC-Type'] == 'revisit':
            http_headers_only = True
        else:
            http_headers_only = False

        if record.length:
            actual_len = 0
            if record.status_headers:
                actual_len = len(record.status_headers.headers_buff)

            if not http_headers_only:
                # Adjust for bytes already consumed from the stream.
                diff = record.stream.tell() - actual_len
                actual_len = record.length - diff

            self._header(out, 'Content-Length', str(actual_len))

            # add empty line
            self._line(out, b'')

            # write headers buffer, if any
            if record.status_headers:
                out.write(record.status_headers.headers_buff)

            if not http_headers_only:
                out.write(record.stream.read())

            # add two lines
            self._line(out, b'\r\n')
        else:
            # add three lines (1 for end of header, 2 for end of record)
            self._line(out, b'Content-Length: 0\r\n\r\n')

        out.flush()

    def _header(self, out, name, value):
        # Falsy values are dropped entirely (no empty headers written).
        if not value:
            return
        self._line(out, (name + ': ' + str(value)).encode('latin-1'))

    def _line(self, out, line):
        out.write(line + b'\r\n')

    @staticmethod
    def _make_warc_id(id_=None):
        # Fresh uuid1 unless an explicit id is provided.
        if not id_:
            id_ = uuid.uuid1()
        return '<urn:uuid:{0}>'.format(id_)
class ArcWarcRecordLoader(object):
    """Load a single ARC or WARC record from a URL/stream and parse it into
    an ArcWarcRecord, auto-detecting the archive format."""

    # Statuslines that identify a WARC record header block.
    WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']

    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']

    HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
                  'OPTIONS', 'CONNECT', 'PATCH']

    NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')

    NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
    HTTP_SCHEMES = ('http:', 'https:')

    def __init__(self, loader=None, cookie_maker=None, block_size=8192,
                 verify_http=True, arc2warc=True):
        """arc2warc=True parses ARC headers via the WARC-mapping parser."""
        if not loader:
            loader = BlockLoader(cookie_maker=cookie_maker)
        self.loader = loader
        self.block_size = block_size

        if arc2warc:
            self.arc_parser = ARC2WARCHeadersParser()
        else:
            self.arc_parser = ARCHeadersParser()

        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
        self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)

    def load(self, url, offset, length, no_record_parse=False):
        """ Load a single record from given url at offset with length
        and parse as either warc or arc record
        """
        try:
            length = int(length)
        except:
            # Unknown/invalid length: -1 signals "read to end" to the loader.
            length = -1

        stream = self.loader.load(url, int(offset), length)
        decomp_type = 'gzip'

        # Create decompressing stream
        stream = DecompressingBufferedReader(stream=stream,
                                             decomp_type=decomp_type,
                                             block_size=self.block_size)

        return self.parse_record_stream(stream,
                                        no_record_parse=no_record_parse)

    def parse_record_stream(self, stream, statusline=None,
                            known_format=None, no_record_parse=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        Pass statusline and known_format to detect_type_loader_headers()
        to faciliate parsing.
        """
        (the_format, rec_headers) = (self._detect_type_load_headers(
            stream, statusline, known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            # ARC declared length includes the header line itself; subtract it.
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                # arc2warc: headers were synthesized from the ARC header line,
                # so its length must still be subtracted; normalize to 'warc'.
                sub_len = rec_headers.total_len
                the_format = 'warc'

        is_err = False

        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True
        except (ValueError, TypeError):
            is_err = True

        # err condition
        if is_err:
            length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        # don't parse the http record at all
        if no_record_parse:
            status_headers = None#StatusAndHeaders('', [])

        # if empty record (error or otherwise) set status to 204
        elif length == 0:
            if is_err:
                msg = '204 Possible Error'
            else:
                msg = '204 No Content'

            status_headers = StatusAndHeaders(msg, [])

        # response record or non-empty revisit: parse HTTP status and headers!
        elif (rec_type in ('response', 'revisit')
              and uri.startswith(self.HTTP_SCHEMES)):
            status_headers = self.http_parser.parse(stream)

        # request record: parse request
        elif ((rec_type == 'request')
              and uri.startswith(self.HTTP_SCHEMES)):
            status_headers = self.http_req_parser.parse(stream)

        # everything else: create a no-status entry, set content-type
        else:
            content_type_header = [('Content-Type', content_type)]

            if length is not None and length >= 0:
                content_type_header.append(('Content-Length', str(length)))

            status_headers = StatusAndHeaders('200 OK', content_type_header)

        return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                             status_headers, content_type, length)

    def _detect_type_load_headers(self, stream,
                                  statusline=None, known_format=None):
        """ If known_format is specified ('warc' or 'arc'),
        parse only as that format.

        Otherwise, try parsing record as WARC, then try parsing as ARC.
        if neither one succeeds, we're out of luck.
        """
        if known_format != 'arc':
            # try as warc first
            try:
                rec_headers = self.warc_parser.parse(stream, statusline)
                return 'warc', rec_headers
            except StatusAndHeadersParserException as se:
                if known_format == 'warc':
                    msg = 'Invalid WARC record, first line: '
                    raise ArchiveLoadFailed(msg + str(se.statusline))

                # Reuse the consumed statusline for the ARC attempt below.
                statusline = se.statusline
                pass

        # now try as arc
        try:
            rec_headers = self.arc_parser.parse(stream, statusline)
            return self.arc_parser.get_rec_type(), rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
                msg = 'Invalid ARC record, first line: '
            else:
                msg = 'Unknown archive format, first line: '
            raise ArchiveLoadFailed(msg + str(se.statusline))
def __init__(self, env):
    """Parse status line and headers from the WSGI request body stream."""
    self.env = env
    # Empty statusline list + verify=False: accept any first line.
    parser = StatusAndHeadersParser([], verify=False)
    self.status_headers = parser.parse(self.env['wsgi.input'])
def test_status_empty():
    """A completely empty stream must raise EOFError from the parser."""
    parser = StatusAndHeadersParser([], verify=False)
    with pytest.raises(EOFError):
        parser.parse(StringIO(''))
class ArcWarcRecordLoader(object):
    """Load a single ARC or WARC record from a URL/stream and parse it into
    an ArcWarcRecord, auto-detecting the archive format."""

    # Standard ARC v1.0 headers
    # TODO: support ARC v2.0 also?
    ARC_HEADERS = [
        "uri",
        "ip-address",
        "archive-date",
        "content-type",
        "length"
    ]

    # Statuslines that identify a WARC record header block.
    WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']

    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']

    HTTP_VERBS = [
        'GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
        'OPTIONS', 'CONNECT', 'PATCH'
    ]

    NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')

    NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
    HTTP_SCHEMES = ('http:', 'https:')

    def __init__(self, loader=None, cookie_maker=None, block_size=8192,
                 verify_http=True):
        """Set up loader and one parser per record flavor."""
        if not loader:
            loader = BlockLoader(cookie_maker)
        self.loader = loader
        self.block_size = block_size

        self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
        self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)

    def load(self, url, offset, length, no_record_parse=False):
        """ Load a single record from given url at offset with length
        and parse as either warc or arc record
        """
        try:
            length = int(length)
        except:
            # Unknown/invalid length: -1 signals "read to end" to the loader.
            length = -1

        stream = self.loader.load(url, int(offset), length)
        decomp_type = 'gzip'

        # Create decompressing stream
        stream = DecompressingBufferedReader(stream=stream,
                                             decomp_type=decomp_type,
                                             block_size=self.block_size)

        return self.parse_record_stream(stream,
                                        no_record_parse=no_record_parse)

    def parse_record_stream(self, stream, statusline=None,
                            known_format=None, no_record_parse=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        Pass statusline and known_format to detect_type_loader_headers()
        to faciliate parsing.
        """
        (the_format, rec_headers) = (self._detect_type_load_headers(
            stream, statusline, known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            # ARC declared length includes the header line itself; subtract it.
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format == 'warc':
            rec_type = rec_headers.get_header('WARC-Type')
            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            sub_len = 0

        is_err = False

        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True
        except (ValueError, TypeError):
            is_err = True

        # err condition
        if is_err:
            length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        # don't parse the http record at all
        if no_record_parse:
            status_headers = None #StatusAndHeaders('', [])

        # if empty record (error or otherwise) set status to 204
        elif length == 0:
            if is_err:
                msg = '204 Possible Error'
            else:
                msg = '204 No Content'

            status_headers = StatusAndHeaders(msg, [])

        # response record or non-empty revisit: parse HTTP status and headers!
        elif (rec_type in ('response', 'revisit')
              and uri.startswith(self.HTTP_SCHEMES)):
            status_headers = self.http_parser.parse(stream)

        # request record: parse request
        elif ((rec_type == 'request')
              and uri.startswith(self.HTTP_SCHEMES)):
            status_headers = self.http_req_parser.parse(stream)

        # everything else: create a no-status entry, set content-type
        else:
            content_type_header = [('Content-Type', content_type)]

            if length is not None and length >= 0:
                content_type_header.append(('Content-Length', str(length)))

            status_headers = StatusAndHeaders('200 OK', content_type_header)

        return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                             status_headers, content_type, length)

    def _detect_type_load_headers(self, stream,
                                  statusline=None, known_format=None):
        """ If known_format is specified ('warc' or 'arc'),
        parse only as that format.

        Otherwise, try parsing record as WARC, then try parsing as ARC.
        if neither one succeeds, we're out of luck.
        """
        if known_format != 'arc':
            # try as warc first
            try:
                rec_headers = self.warc_parser.parse(stream, statusline)
                return 'warc', rec_headers
            except StatusAndHeadersParserException as se:
                if known_format == 'warc':
                    msg = 'Invalid WARC record, first line: '
                    raise ArchiveLoadFailed(msg + str(se.statusline))

                # Reuse the consumed statusline for the ARC attempt below.
                statusline = se.statusline
                pass

        # now try as arc
        try:
            rec_headers = self.arc_parser.parse(stream, statusline)
            return 'arc', rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
                msg = 'Invalid ARC record, first line: '
            else:
                msg = 'Unknown archive format, first line: '
            raise ArchiveLoadFailed(msg + str(se.statusline))
def test_status_one_word():
    """A lone token with no headers still stringifies with a CRLF terminator."""
    parsed = StatusAndHeadersParser(['GET'], verify=False).parse(StringIO('A'))
    assert str(parsed) == 'A\r\n'
class WARCPathLoader(BaseLoader):
    """Load WARC record payloads referenced by CDX entries, resolving
    relative filenames against a set of path prefixes or redis keys."""

    def __init__(self, paths, cdx_source):
        """paths may be a single path string or a list; cdx_source is used
        for follow-up index queries during revisit resolution."""
        self.paths = paths
        if isinstance(paths, six.string_types):
            self.paths = [paths]

        self.resolvers = [self._make_resolver(path) for path in self.paths]

        self.resolve_loader = ResolvingLoader(self.resolvers,
                                              no_record_parse=True)

        self.headers_parser = StatusAndHeadersParser([], verify=False)

        self.cdx_source = cdx_source

    def _make_resolver(self, path):
        """Map a path spec to a resolver: callable as-is, redis:// URLs to
        RedisResolver, anything else to PrefixResolver."""
        if hasattr(path, '__call__'):
            return path

        if path.startswith('redis://'):
            return RedisResolver(path)

        else:
            return PrefixResolver(path)

    def load_resource(self, cdx, params):
        """Resolve and load the WARC record for cdx.

        Returns (warc_headers, http_headers_buff, stream) or None if the
        cdx entry lacks a filename/offset. For revisits resolved to another
        record, the original's URI/Date are moved to Refers-To headers.
        """
        if cdx.get('_cached_result'):
            return cdx.get('_cached_result')

        if not cdx.get('filename') or cdx.get('offset') is None:
            return None

        orig_source = cdx.get('source', '').split(':')[0]
        formatter = ParamFormatter(params, orig_source)
        cdx._formatter = formatter

        def local_index_query(local_params):
            # Propagate caller 'param.'-prefixed params into the sub-query.
            for n, v in six.iteritems(params):
                if n.startswith('param.'):
                    local_params[n] = v

            cdx_iter, errs = self.cdx_source(local_params)
            for cdx in cdx_iter:
                cdx._formatter = formatter
                yield cdx
            # NOTE(review): 'return <value>' inside a generator only sets the
            # StopIteration value; confirm this return is intentional.
            return cdx_iter

        failed_files = []
        headers, payload = (self.resolve_loader.
                            load_headers_and_payload(cdx,
                                                     failed_files,
                                                     local_index_query))

        status = cdx.get('status')
        # Missing status or a 3xx: parse HTTP headers to check self-redirects.
        if not status or status.startswith('3'):
            status_headers = self.headers_parser.parse(payload.stream)
            self.raise_on_self_redirect(params, cdx,
                                        status_headers.get_statuscode(),
                                        status_headers.get_header('Location'))
            http_headers_buff = status_headers.to_bytes()
        else:
            http_headers_buff = None

        warc_headers = payload.rec_headers

        # headers != payload means a revisit was resolved to another record:
        # keep the payload's identity as Refers-To, adopt the original's.
        if headers != payload:
            warc_headers.replace_header('WARC-Refers-To-Target-URI',
                                        payload.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header('WARC-Refers-To-Date',
                                        payload.rec_headers.get_header('WARC-Date'))

            warc_headers.replace_header('WARC-Target-URI',
                                        headers.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header('WARC-Date',
                                        headers.rec_headers.get_header('WARC-Date'))

            headers.stream.close()

        return (warc_headers,
                http_headers_buff,
                payload.stream)

    def __str__(self):
        return 'WARCPathLoader'