def test_to_str_2():
    """Parsing request headers and stringifying must round-trip the input,
    with or without a trailing blank line."""
    for tail in ('', '\r\n'):
        parsed = StatusAndHeadersParser(['GET']).parse(StringIO(req_headers + tail))
        assert str(parsed) == req_headers
def __init__(self, gzip=True, *args, **kwargs):
    """Set up writer state; optional 'warc_version' and 'header_filter' kwargs
    override the class defaults."""
    self.warc_version = kwargs.get('warc_version', self.WARC_VERSION)
    self.header_filter = kwargs.get('header_filter')
    self.gzip = gzip
    self.hostname = gethostname()
    self.parser = StatusAndHeadersParser([], verify=False)
def _check_uri_date(self, resp, uri, dt):
    """Assert the WARC-Target-URI / WARC-Date headers of a chunked WARC response.

    :param resp: response object whose body is a chunked WARC record
    :param uri: exact expected WARC-Target-URI value
    :param dt: True to require only that a WARC-Date is present,
               otherwise the exact expected date string
    """
    buff = BytesIO(resp.body)
    buff = ChunkedDataReader(buff)
    status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff)
    assert status_headers.get_header('WARC-Target-URI') == uri
    # identity test (was '== True'): a non-bool dt must take the exact-match branch
    if dt is True:
        assert status_headers.get_header('WARC-Date') != ''
    else:
        assert status_headers.get_header('WARC-Date') == dt
def test_to_str_with_remove():
    """remove_header('Foo') drops that header from the serialized output."""
    parsed = StatusAndHeadersParser(['GET']).parse(StringIO(req_headers))
    parsed.remove_header('Foo')

    expected = ("GET / HTTP/1.0\r\n"
                "Content-Length: 0\r\n")
    assert str(parsed) == expected
def __init__(self, paths, cdx_source):
    """Build path resolvers and a headers-only resolving loader."""
    self.paths = paths
    self.cdx_source = cdx_source
    self.resolvers = self.make_resolvers(self.paths)
    self.resolve_loader = ResolvingLoader(self.resolvers, no_record_parse=True)
    self.headers_parser = StatusAndHeadersParser([], verify=False)
def __init__(self, gzip=True, *args, **kwargs):
    """Delegate version/filter options to the base class; keep gzip,
    hostname and a lenient header parser locally."""
    warc_version = kwargs.get('warc_version')
    header_filter = kwargs.get('header_filter')
    super(BaseWARCWriter, self).__init__(warc_version=warc_version,
                                         header_filter=header_filter)
    self.gzip = gzip
    self.hostname = gethostname()
    self.parser = StatusAndHeadersParser([], verify=False)
def test_to_str_exclude():
    """A filter callable returning None removes that header from
    to_str() and to_bytes() output."""
    def drop_multi_line(h):
        return None if h[0].lower() == 'multi-line' else h

    sah = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
    res = sah.to_str(drop_multi_line)

    exp = ("HTTP/1.0 200 OK\r\n"
           "Content-Type: ABC\r\n"
           "Some: Value\r\n")

    assert res == exp
    assert sah.to_bytes(drop_multi_line) == exp.encode('latin-1') + b'\r\n'
def test_agg_local_revisit(self):
    """A closest-match query against the local source returns a revisit
    record pointing back at the original capture."""
    resp = self.testapp.get('/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local')

    assert resp.headers['Warcserver-Source-Coll'] == 'local:dupes.cdxj'

    warc_header = StatusAndHeadersParser(['WARC/1.0']).parse(BytesIO(resp.body)).get_header
    assert warc_header('WARC-Target-URI') == 'http://example.com'
    assert warc_header('WARC-Date') == '2014-01-27T17:12:51Z'
    assert warc_header('WARC-Refers-To-Target-URI') == 'http://example.com'
    assert warc_header('WARC-Refers-To-Date') == '2014-01-27T17:12:00Z'

    assert resp.headers['Link'] == MementoUtils.make_link('http://example.com', 'original')
    assert resp.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'<!doctype html>' in resp.body

    assert 'ResErrors' not in resp.headers
def test_to_str_1():
    """str() of a parsed response reproduces the status line and every header."""
    parsed = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))

    exp = ("HTTP/1.0 200 OK\r\n"
           "Content-Type: ABC\r\n"
           "Some: Value\r\n"
           "Multi-Line: Value1 Also This\r\n")

    assert str(parsed) == exp
def test_record_param_user_coll_revisit(self):
    """Record the same URL again into {user}/{coll}-templated warcs and verify
    the second capture is deduped into a 'warc/revisit' record that refers
    back to the original."""
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index()

    recorder_app = RecorderApp(
        self.upstream_url,
        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

    self._test_all_warcs('/warcs/USER/COLL/', 1)

    # BUG FIX: the query string contained '¶m.' -- mojibake of the HTML
    # entity '&para;' -- instead of the literal '&param.' prefix the
    # recorder expects for its 'param.recorder.*' parameters
    resp = self._test_warc_write(
        recorder_app, 'httpbin.org', '/user-agent',
        '&param.recorder.user=USER&param.recorder.coll=COLL')

    assert '"user-agent": "{0}"'.format(UA) in resp.text
    #assert b'HTTP/1.1 200 OK' in resp.body
    #assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # Test Redis CDX
    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    # either entry may be the revisit; pick it
    if b'warc/revisit' in res[0]:
        cdx = CDXObject(res[0])
    else:
        cdx = CDXObject(res[1])

    assert cdx['urlkey'] == 'org,httpbin)/user-agent'
    assert cdx['mime'] == 'warc/revisit'
    assert cdx['offset'] == '0'
    assert cdx['filename'].startswith(to_path('USER/COLL/'))
    assert cdx['filename'].endswith('.warc.gz')

    fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

    warcs = r.hgetall('USER:COLL:warc')
    assert len(warcs) == 2
    assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode(
        'utf-8')

    with open(fullwarc, 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)

        # Test refers-to headers
        status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
        assert status_headers.get_header('WARC-Type') == 'revisit'
        assert status_headers.get_header(
            'WARC-Target-URI') == 'http://httpbin.org/user-agent'
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header(
            'WARC-Refers-To-Target-URI') == 'http://httpbin.org/user-agent'
        assert status_headers.get_header('WARC-Refers-To-Date') != ''
def test_agg_local_revisit(self):
    """Closest-match lookup from the local source yields a revisit record."""
    resp = self.testapp.get('/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local')

    assert resp.headers['Warcserver-Source-Coll'] == 'local:dupes.cdxj'

    status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(BytesIO(resp.body))

    expected = {
        'WARC-Target-URI': 'http://example.com',
        'WARC-Date': '2014-01-27T17:12:51Z',
        'WARC-Refers-To-Target-URI': 'http://example.com',
        'WARC-Refers-To-Date': '2014-01-27T17:12:00Z',
    }
    for name, value in expected.items():
        assert status_headers.get_header(name) == value

    assert resp.headers['Link'] == MementoUtils.make_link('http://example.com', 'original')
    assert resp.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'<!doctype html>' in resp.body

    assert 'ResErrors' not in resp.headers
def parse(self, stream, headerline=None):
    """Parse one ARC header line into WARC-style StatusAndHeaders.

    If the line is an ARC file header ('filedesc://...'), the next two
    lines (version + header spec) are consumed and counted in total_len.

    :param stream: file-like object positioned at the record header
    :param headerline: optional pre-read header line (read from stream if None)
    :raises EOFError: if the stream is exhausted
    :raises StatusAndHeadersParserException: on a field-count mismatch
    """
    total_read = 0

    if headerline is None:
        headerline = stream.readline()

    headerline = StatusAndHeadersParser.decode_header(headerline)
    header_len = len(headerline)

    if header_len == 0:
        raise EOFError()

    headerline = headerline.rstrip()

    headernames = self.headernames

    # if arc header, consume next two lines
    if headerline.startswith('filedesc://'):
        version = StatusAndHeadersParser.decode_header(stream.readline())  # skip version
        spec = StatusAndHeadersParser.decode_header(stream.readline())  # skip header spec, use preset one
        total_read += len(version)
        total_read += len(spec)

    parts = headerline.split(' ')

    if len(parts) != len(headernames):
        msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
        msg = msg.format(headernames, parts)
        raise StatusAndHeadersParserException(msg, parts)

    protocol, headers = self._get_protocol_and_headers(headerline, parts)

    # FIX: use the protocol computed above instead of discarding it and
    # hard-coding 'WARC/1.0' (same value today, but a single source of truth)
    return StatusAndHeaders(statusline='',
                            headers=headers,
                            protocol=protocol,
                            total_len=total_read)
def __init__(self, verify_http=True, arc2warc=True):
    """Create the ARC and WARC sub-parsers; arc2warc selects the parser
    that presents ARC records with WARC-style headers."""
    self.arc_parser = ARC2WARCHeadersParser() if arc2warc else ARCHeadersParser()

    self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
    self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
    self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
def test_record_param_user_coll_revisit(self):
    """Re-record a URL into {user}/{coll} warcs; the second capture must be
    deduplicated into a 'warc/revisit' record with refers-to headers."""
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index()

    recorder_app = RecorderApp(self.upstream_url,
                               PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

    self._test_all_warcs('/warcs/USER/COLL/', 1)

    # BUG FIX: '¶m.' was mojibake of the HTML entity '&para;' followed by 'm.';
    # the recorder parameters must be passed as literal '&param.recorder.*'
    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/user-agent',
                                 '&param.recorder.user=USER&param.recorder.coll=COLL')

    assert '"user-agent": "{0}"'.format(UA) in resp.text
    #assert b'HTTP/1.1 200 OK' in resp.body
    #assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # Test Redis CDX
    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    # either of the two entries may be the revisit record
    if b'warc/revisit' in res[0]:
        cdx = CDXObject(res[0])
    else:
        cdx = CDXObject(res[1])

    assert cdx['urlkey'] == 'org,httpbin)/user-agent'
    assert cdx['mime'] == 'warc/revisit'
    assert cdx['offset'] == '0'
    assert cdx['filename'].startswith(to_path('USER/COLL/'))
    assert cdx['filename'].endswith('.warc.gz')

    fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

    warcs = r.hgetall('USER:COLL:warc')
    assert len(warcs) == 2
    assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode('utf-8')

    with open(fullwarc, 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)

        # Test refers-to headers
        status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
        assert status_headers.get_header('WARC-Type') == 'revisit'
        assert status_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/user-agent'
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/user-agent'
        assert status_headers.get_header('WARC-Refers-To-Date') != ''
def _get_protocol_and_headers(self, headerline, parts):
    """Map the split ARC header fields onto WARC header tuples.

    Returns ('WARC/1.0', headers) where headers starts with WARC-Type and
    a fresh WARC-Record-ID, followed by one tuple per ARC field.
    """
    if headerline.startswith('filedesc://'):
        rec_type = 'warcinfo'
    else:
        rec_type = 'response'
        parts[3] = 'application/http;msgtype=response'

    headers = [('WARC-Type', rec_type),
               ('WARC-Record-ID', StatusAndHeadersParser.make_warc_id())]

    for name, value in zip(self.headernames, parts):
        if name == 'WARC-Date':
            value = timestamp_to_iso_date(value)

        # the file header's uri becomes the warcinfo filename
        if rec_type == 'warcinfo' and name == 'WARC-Target-URI':
            name = 'WARC-Filename'
            value = value[len('filedesc://'):]

        headers.append((name, value))

    return ('WARC/1.0', headers)
def test_status_one_word():
    """With verify disabled, a one-word status line parses and round-trips."""
    parsed = StatusAndHeadersParser(['GET'], verify=False).parse(StringIO('A'))
    assert str(parsed) == 'A\r\n'
def test_status_empty():
    """An empty stream raises EOFError even with verification disabled."""
    parser = StatusAndHeadersParser([], verify=False)
    with pytest.raises(EOFError):
        parser.parse(StringIO(''))
class ArcWarcRecordLoader(object):
    """Parses a stream into an ArcWarcRecord: the WARC/ARC record headers,
    any embedded HTTP headers, and a stream limited to the record body."""

    # WARC versions accepted on the record status line
    WARC_TYPES = ['WARC/1.1', 'WARC/1.0', 'WARC/0.17', 'WARC/0.18']

    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']

    HTTP_VERBS = [
        'GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE', 'OPTIONS',
        'CONNECT', 'PATCH'
    ]

    # record types which may contain an HTTP message
    HTTP_RECORDS = ('response', 'request', 'revisit')

    NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
    HTTP_SCHEMES = ('http:', 'https:')

    def __init__(self, verify_http=True, arc2warc=True):
        # arc2warc presents ARC records with WARC-style header names
        if arc2warc:
            self.arc_parser = ARC2WARCHeadersParser()
        else:
            self.arc_parser = ARCHeadersParser()

        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
        self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)

    def parse_record_stream(self, stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False,
                            ensure_http_headers=False):
        """Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        Pass statusline and known_format to detect_type_loader_headers()
        to facilitate parsing.
        """
        (the_format, rec_headers) = (self._detect_type_load_headers(
            stream, statusline, known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = self._ensure_target_uri_format(rec_headers)
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                # arc2warc: header bytes already consumed count against length
                sub_len = rec_headers.total_len
                the_format = 'warc'

        is_err = False

        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True

        except (ValueError, TypeError):
            is_err = True

        # err condition: unparsable or negative length -> treat body as empty
        if is_err:
            length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        http_headers = None

        # load http headers if parsing
        if not no_record_parse:
            http_headers = self.load_http_headers(rec_type, uri, stream, length)

        # generate valid http headers (eg. for replay)
        if not http_headers and ensure_http_headers:
            http_headers = self.default_http_headers(length, content_type)

        return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                             http_headers, content_type, length)

    def load_http_headers(self, rec_type, uri, stream, length):
        """Parse embedded HTTP headers from stream, or return None when the
        record cannot contain any."""
        # only if length == 0 don't parse
        # try parsing is length is unknown (length is None) or length > 0
        if length == 0:
            return None

        # only certain record types can have http headers
        if rec_type not in self.HTTP_RECORDS:
            return None

        # only http:/https: uris can have http headers
        if not uri.startswith(self.HTTP_SCHEMES):
            return None

        # request record: parse request
        if rec_type == 'request':
            return self.http_req_parser.parse(stream)

        elif rec_type == 'revisit':
            try:
                return self.http_parser.parse(stream)
            except EOFError:
                # empty revisit with no http headers, is ok!
                return None

        # response record or non-empty revisit: parse HTTP status and headers!
        else:
            return self.http_parser.parse(stream)

    def default_http_headers(self, length, content_type=None):
        """Synthesize a minimal '200 OK' HTTP header block (for replay)."""
        headers = []
        if content_type:
            headers.append(('Content-Type', content_type))

        if length is not None and length >= 0:
            headers.append(('Content-Length', str(length)))

        return StatusAndHeaders('200 OK', headers=headers, protocol='HTTP/1.0')

    def _detect_type_load_headers(self, stream,
                                  statusline=None, known_format=None):
        """If known_format is specified ('warc' or 'arc'),
        parse only as that format.

        Otherwise, try parsing record as WARC, then try parsing as ARC.
        if neither one succeeds, we're out of luck.
        """
        if known_format != 'arc':
            # try as warc first
            try:
                rec_headers = self.warc_parser.parse(stream, statusline)
                return 'warc', rec_headers
            except StatusAndHeadersParserException as se:
                if known_format == 'warc':
                    msg = 'Invalid WARC record, first line: '
                    raise ArchiveLoadFailed(msg + str(se.statusline))

                # fall through to ARC, reusing the already-read status line
                statusline = se.statusline
                pass

        # now try as arc
        try:
            rec_headers = self.arc_parser.parse(stream, statusline)
            return self.arc_parser.get_rec_type(), rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
                msg = 'Invalid ARC record, first line: '
            else:
                msg = 'Unknown archive format, first line: '
            raise ArchiveLoadFailed(msg + str(se.statusline))

    def _ensure_target_uri_format(self, rec_headers):
        """Checks the value for the WARC-Target-URI header field to see if
        it starts with '<' and ends with '>' (Wget 1.19 bug) and if '<' and '>'
        are present, corrects and updates the field returning the corrected
        value for the field otherwise just returns the fields value.

        :param StatusAndHeaders rec_headers: The parsed WARC headers
        :return: The value for the WARC-Target-URI field
        :rtype: str | None
        """
        uri = rec_headers.get_header('WARC-Target-URI')
        if uri is not None and uri.startswith('<') and uri.endswith('>'):
            uri = uri[1:-1]
            rec_headers.replace_header('WARC-Target-URI', uri)

        # BEGIN PERMA CUSTOMIZATION
        # https://github.com/webrecorder/warcio/blob/c64c4394805e13256695f51af072c95389397ee9/warcio/recordloader.py#L217
        # https://github.com/webrecorder/warcio/pull/80
        # don't pass WARC-Target-URI with spaces to the cdxline indexers, which don't expect that
        # cause of at least some of the errors in https://github.com/harvard-lil/perma/issues/2605
        if uri is not None and " " in uri:
            logger.warning(
                "Replacing spaces in invalid WARC-Target-URI: {}".format(uri))
            uri = uri.replace(" ", "%20")
            rec_headers.replace_header('WARC-Target-URI', uri)
        # END PERMA CUSTOMIZATION

        return uri
def __init__(self, env):
    """Parse the request's status line and headers from the WSGI input stream."""
    self.env = env
    self.status_headers = StatusAndHeadersParser([], verify=False).parse(env['wsgi.input'])
class WARCPathLoader(DefaultResolverMixin, BaseLoader):
    """Loads a WARC record for a CDX entry from local/resolved paths,
    returning (warc headers, optional http headers buffer, payload stream)."""

    def __init__(self, paths, cdx_source):
        self.paths = paths
        self.resolvers = self.make_resolvers(self.paths)
        # headers only: HTTP parsing is deferred to load_resource()
        self.resolve_loader = ResolvingLoader(self.resolvers,
                                              no_record_parse=True)
        self.headers_parser = StatusAndHeadersParser([], verify=False)
        self.cdx_source = cdx_source

    def load_resource(self, cdx, params):
        """Resolve and load the record for cdx; returns None when the entry
        has no filename/offset, or the cached result if one is set."""
        if cdx.get('_cached_result'):
            return cdx.get('_cached_result')

        if not cdx.get('filename') or cdx.get('offset') is None:
            return None

        orig_source = cdx.get('source', '').split(':')[0]
        formatter = ParamFormatter(params, orig_source)
        cdx._formatter = formatter

        def local_index_query(local_params):
            # forward caller 'param.*' params to the secondary cdx lookup
            for n, v in six.iteritems(params):
                if n.startswith('param.'):
                    local_params[n] = v

            cdx_iter, errs = self.cdx_source(local_params)
            for cdx in cdx_iter:
                cdx._formatter = formatter
                yield cdx

        failed_files = []
        headers, payload = (self.resolve_loader.load_headers_and_payload(
            cdx, failed_files, local_index_query))

        http_headers_buff = None
        if payload.rec_type in ('response', 'revisit'):
            status = cdx.get('status')

            # if status is not set and not, 2xx, 4xx, 5xx
            # go through self-redirect check just in case
            if not status or not status.startswith(('2', '4', '5')):
                http_headers = self.headers_parser.parse(payload.raw_stream)
                try:
                    orig_size = payload.raw_stream.tell()
                except:
                    # NOTE(review): bare except silently defaults orig_size;
                    # presumably covers non-seekable streams -- confirm
                    orig_size = 0

                try:
                    self.raise_on_self_redirect(
                        params, cdx, http_headers.get_statuscode(),
                        http_headers.get_header('Location'))
                except LiveResourceException:
                    no_except_close(headers.raw_stream)
                    no_except_close(payload.raw_stream)
                    raise

                http_headers_buff = http_headers.to_bytes()

                # if new http_headers_buff is different length,
                # attempt to adjust content-length on the WARC record
                if orig_size and len(http_headers_buff) != orig_size:
                    orig_cl = payload.rec_headers.get_header('Content-Length')
                    if orig_cl:
                        new_cl = int(orig_cl) + (len(http_headers_buff) - orig_size)
                        payload.rec_headers.replace_header('Content-Length', str(new_cl))

        warc_headers = payload.rec_headers

        # headers and payload came from different records: carry the payload
        # record's URI/date over as Refers-To-* and use the headers record's own
        if headers != payload:
            warc_headers.replace_header(
                'WARC-Refers-To-Target-URI',
                payload.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header(
                'WARC-Refers-To-Date',
                payload.rec_headers.get_header('WARC-Date'))

            warc_headers.replace_header(
                'WARC-Target-URI',
                headers.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header(
                'WARC-Date', headers.rec_headers.get_header('WARC-Date'))

            no_except_close(headers.raw_stream)

        return (warc_headers, http_headers_buff, payload.raw_stream)

    def __str__(self):
        return 'WARCPathLoader'
def _make_warc_id(cls):
    """Return a fresh WARC-Record-ID via StatusAndHeadersParser.make_warc_id()."""
    return StatusAndHeadersParser.make_warc_id()
class BaseWARCWriter(object):
    """Base class for WARC writers: builds records (warcinfo, revisit, etc.),
    computes digests and serializes records; subclasses implement the actual
    write_record / _do_write_req_resp output."""

    # default Content-Type per WARC record type
    WARC_RECORDS = {
        'warcinfo': 'application/warc-fields',
        'response': 'application/http; msgtype=response',
        'revisit': 'application/http; msgtype=response',
        'request': 'application/http; msgtype=request',
        'metadata': 'application/warc-fields',
    }

    REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/identical-payload-digest'

    WARC_VERSION = 'WARC/1.0'

    def __init__(self, gzip=True, *args, **kwargs):
        self.gzip = gzip
        self.hostname = gethostname()
        self.parser = StatusAndHeadersParser([], verify=False)
        self.warc_version = kwargs.get('warc_version', self.WARC_VERSION)
        self.header_filter = kwargs.get('header_filter')

    @classmethod
    def _iter_stream(cls, stream):
        # yield BUFF_SIZE chunks until the stream is exhausted
        while True:
            buf = stream.read(BUFF_SIZE)
            if not buf:
                return

            yield buf

    def ensure_digest(self, record, block=True, payload=True):
        """Compute and add WARC-Block-Digest / WARC-Payload-Digest headers
        for any of the two that is requested and not already present.

        If the record stream is not seekable, the payload is spooled to a
        temp file so it can be re-read at write time.
        """
        if block and record.rec_headers.get_header('WARC-Block-Digest'):
            block = False

        if payload and record.rec_headers.get_header('WARC-Payload-Digest'):
            payload = False

        block_digester = self._create_digester() if block else None
        payload_digester = self._create_digester() if payload else None

        if not block_digester and not payload_digester:
            return

        temp_file = None
        try:
            # probe seekability; remember position to rewind to afterwards
            pos = record.raw_stream.tell()
            record.raw_stream.seek(pos)
        except:
            # NOTE(review): bare except -- non-seekable stream assumed here
            pos = 0
            temp_file = self._create_temp_file()

        # block digest covers the serialized http headers plus the payload
        if block_digester and record.http_headers and record.http_headers.headers_buff:
            block_digester.update(record.http_headers.headers_buff)

        for buf in self._iter_stream(record.raw_stream):
            if block_digester:
                block_digester.update(buf)

            if payload_digester:
                payload_digester.update(buf)

            if temp_file:
                temp_file.write(buf)

        if temp_file:
            # swap the consumed stream for the rewound spooled copy
            record.payload_length = temp_file.tell()
            temp_file.seek(0)
            record._orig_stream = record.raw_stream
            record.raw_stream = temp_file
        else:
            record.raw_stream.seek(pos)

        if block_digester:
            record.rec_headers.add_header('WARC-Block-Digest',
                                          str(block_digester))

        if payload_digester:
            record.rec_headers.add_header('WARC-Payload-Digest',
                                          str(payload_digester))

    def _create_digester(self):
        return Digester('sha1')

    def write_request_response_pair(self, req, resp, params=None):
        """Link a request record to its response (shared URI/date,
        WARC-Concurrent-To) and write both."""
        url = resp.rec_headers.get_header('WARC-Target-URI')
        dt = resp.rec_headers.get_header('WARC-Date')

        req.rec_headers.replace_header('WARC-Target-URI', url)
        req.rec_headers.replace_header('WARC-Date', dt)

        resp_id = resp.rec_headers.get_header('WARC-Record-ID')
        if resp_id:
            req.rec_headers.add_header('WARC-Concurrent-To', resp_id)

        self._do_write_req_resp(req, resp, params)

    def write_record(self, record, params=None):  #pragma: no cover
        # NOTE(review): NotImplemented is not an exception class; calling it
        # raises TypeError -- should be NotImplementedError
        raise NotImplemented()

    def _do_write_req_resp(self, req, resp, params):  #pragma: no cover
        # NOTE(review): same NotImplemented issue as write_record
        raise NotImplemented()

    def create_warcinfo_record(self, filename, info):
        """Build a 'warcinfo' record whose payload is the info dict rendered
        as 'name: value' warc-fields lines."""
        warc_headers = StatusAndHeaders(self.warc_version, [])
        warc_headers.add_header('WARC-Type', 'warcinfo')
        warc_headers.add_header('WARC-Record-ID', self._make_warc_id())
        if filename:
            warc_headers.add_header('WARC-Filename', filename)
        warc_headers.add_header('WARC-Date', self._make_warc_date())

        warcinfo = BytesIO()
        for name, value in six.iteritems(info):
            if not value:
                continue

            line = name + ': ' + str(value) + '\r\n'
            warcinfo.write(line.encode('latin-1'))

        length = warcinfo.tell()
        warcinfo.seek(0)

        return self.create_warc_record('', 'warcinfo',
                                       warc_headers=warc_headers,
                                       payload=warcinfo,
                                       length=length)

    def create_revisit_record(self, uri, digest, refers_to_uri,
                              refers_to_date, http_headers=None):
        """Build a 'revisit' record (identical-payload-digest profile)
        pointing at an earlier capture."""
        record = self.create_warc_record(uri, 'revisit',
                                         http_headers=http_headers)

        record.rec_headers.add_header('WARC-Profile', self.REVISIT_PROFILE)

        record.rec_headers.add_header('WARC-Refers-To-Target-URI',
                                      refers_to_uri)
        record.rec_headers.add_header('WARC-Refers-To-Date', refers_to_date)

        record.rec_headers.add_header('WARC-Payload-Digest', digest)

        return record

    def create_record_from_stream(self, record_stream, length):
        """Build a record by parsing WARC headers off the front of a stream."""
        warc_headers = self.parser.parse(record_stream)

        return self.create_warc_record('',
                                       warc_headers.get_header('WARC-Type'),
                                       payload=record_stream,
                                       length=length,
                                       warc_headers=warc_headers)

    def create_warc_record(self, uri, record_type,
                           payload=None,
                           length=0,
                           warc_content_type='',
                           warc_headers_dict={},
                           warc_headers=None,
                           http_headers=None):
        """Assemble an ArcWarcRecord of the given type.

        NOTE(review): warc_headers_dict uses a mutable default; it is only
        read (via items()) so this is currently harmless, but fragile.
        """
        # http-message records: split http headers off the front of payload
        if payload and not http_headers and record_type in ('response', 'request', 'revisit'):
            http_headers = self.parser.parse(payload)
            # subtract the header bytes already consumed from the body length
            length -= payload.tell()

        if not payload:
            payload = BytesIO()
            length = 0

        if not warc_headers:
            warc_headers = self._init_warc_headers(uri, record_type,
                                                   warc_headers_dict)

        # compute Content-Type
        if not warc_content_type:
            warc_content_type = warc_headers.get_header('Content-Type')

            if not warc_content_type:
                warc_content_type = self.WARC_RECORDS.get(record_type)

        record = ArcWarcRecord('warc', record_type, warc_headers, payload,
                               http_headers, warc_content_type, length)

        record.payload_length = length

        if record_type not in ('warcinfo', 'revisit'):
            self.ensure_digest(record, block=False, payload=True)

        return record

    def _init_warc_headers(self, uri, record_type, warc_headers_dict):
        """Seed WARC headers from the dict, ensuring type, id, uri and date."""
        warc_headers = StatusAndHeaders(self.warc_version,
                                        list(warc_headers_dict.items()))
        warc_headers.replace_header('WARC-Type', record_type)
        if not warc_headers.get_header('WARC-Record-ID'):
            warc_headers.add_header('WARC-Record-ID', self._make_warc_id())

        if uri:
            warc_headers.replace_header('WARC-Target-URI', uri)

        if not warc_headers.get_header('WARC-Date'):
            warc_headers.add_header('WARC-Date', self._make_warc_date())

        return warc_headers

    def _set_header_buff(self, record):
        # serialize http headers (through the optional filter) once for reuse
        headers_buff = record.http_headers.to_bytes(self.header_filter)
        record.http_headers.headers_buff = headers_buff

    def _write_warc_record(self, out, record, adjust_cl=True):
        """Serialize one record to out (gzip-wrapped when self.gzip).

        NOTE(review): the adjust_cl parameter is unused in this body.
        """
        if self.gzip:
            out = GzippingWrapper(out)

        if record.http_headers:
            self._set_header_buff(record)

        # ensure digests are set
        if record.rec_type != 'warcinfo':
            self.ensure_digest(record, block=True, payload=False)

        # ensure proper content type
        record.rec_headers.replace_header('Content-Type', record.content_type)

        # revisit records write only the http headers, no payload body
        if record.rec_type == 'revisit':
            http_headers_only = True
        else:
            http_headers_only = False

        # compute Content-Length
        if record.http_headers and record.payload_length >= 0:
            actual_len = 0

            if record.http_headers:
                actual_len = len(record.http_headers.headers_buff)

            if not http_headers_only:
                actual_len += record.payload_length

            record.length = actual_len

        record.rec_headers.replace_header('Content-Length', str(record.length))

        # write record headers
        out.write(record.rec_headers.to_bytes())

        # write headers buffer, if any
        if record.http_headers:
            out.write(record.http_headers.headers_buff)

        if not http_headers_only:
            try:
                for buf in self._iter_stream(record.raw_stream):
                    out.write(buf)
            finally:
                # restore the original stream if a temp spool was substituted
                if hasattr(record, '_orig_stream'):
                    record.raw_stream.close()
                    record.raw_stream = record._orig_stream

        # add two lines
        out.write(b'\r\n\r\n')

        out.flush()

    @classmethod
    def _make_warc_id(cls):
        return StatusAndHeadersParser.make_warc_id()

    @classmethod
    def _make_warc_date(cls):
        return datetime_to_iso_date(datetime.datetime.utcnow())

    @classmethod
    def _create_temp_file(cls):
        # spill to disk only past 512KB
        return tempfile.SpooledTemporaryFile(max_size=512 * 1024)
class ArcWarcRecordLoader(object):
    """Parses a stream into an ArcWarcRecord, optionally wrapping it in a
    digest-verifying reader when check_digests is requested."""

    # WARC versions accepted on the record status line
    WARC_TYPES = ['WARC/1.1', 'WARC/1.0', 'WARC/0.17', 'WARC/0.18']

    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']

    HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
                  'OPTIONS', 'CONNECT', 'PATCH']

    # record types which may contain an HTTP message
    HTTP_RECORDS = ('response', 'request', 'revisit')

    NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
    HTTP_SCHEMES = ('http:', 'https:')

    def __init__(self, verify_http=True, arc2warc=True):
        # arc2warc presents ARC records with WARC-style header names
        if arc2warc:
            self.arc_parser = ARC2WARCHeadersParser()
        else:
            self.arc_parser = ARCHeadersParser()

        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
        self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)

    def parse_record_stream(self, stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False,
                            ensure_http_headers=False,
                            check_digests=False):
        """Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        Pass statusline and known_format to detect_type_loader_headers()
        to facilitate parsing.
        """
        (the_format, rec_headers) = (self._detect_type_load_headers(
            stream, statusline, known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = self._ensure_target_uri_format(rec_headers)
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                # arc2warc: header bytes already consumed count against length
                sub_len = rec_headers.total_len
                the_format = 'warc'

        is_err = False

        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True

        except (ValueError, TypeError):
            is_err = True

        # err condition: unparsable or negative length -> treat body as empty
        if is_err:
            length = 0

        is_verifying = False
        digest_checker = DigestChecker(check_digests)

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

            if check_digests:
                stream, is_verifying = self.wrap_digest_verifying_stream(
                    stream, rec_type, rec_headers, digest_checker,
                    length=length)

        http_headers = None
        payload_length = -1

        # load http headers if parsing
        if not no_record_parse:
            start = stream.tell()
            http_headers = self.load_http_headers(rec_type, uri, stream, length)
            # payload is whatever remains after the http headers were consumed
            if length and http_headers:
                payload_length = length - (stream.tell() - start)

        # generate valid http headers (eg. for replay)
        if not http_headers and ensure_http_headers:
            http_headers = self.default_http_headers(length, content_type)

        if is_verifying:
            stream.begin_payload()

        return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                             http_headers, content_type, length,
                             payload_length=payload_length,
                             digest_checker=digest_checker)

    def wrap_digest_verifying_stream(self, stream, rec_type, rec_headers,
                                     digest_checker, length=None):
        """Wrap stream in a DigestVerifyingReader when the record declares a
        payload and/or block digest; returns (stream, is_verifying)."""
        payload_digest = rec_headers.get_header('WARC-Payload-Digest')
        block_digest = rec_headers.get_header('WARC-Block-Digest')
        segment_number = rec_headers.get_header('WARC-Segment-Number')

        # nothing to verify
        if not payload_digest and not block_digest:
            return stream, False

        stream = DigestVerifyingReader(stream, length, digest_checker,
                                       record_type=rec_type,
                                       payload_digest=payload_digest,
                                       block_digest=block_digest,
                                       segment_number=segment_number)

        return stream, True

    def load_http_headers(self, rec_type, uri, stream, length):
        """Parse embedded HTTP headers from stream, or return None when the
        record cannot contain any."""
        # only if length == 0 don't parse
        # try parsing is length is unknown (length is None) or length > 0
        if length == 0:
            return None

        # only certain record types can have http headers
        if rec_type not in self.HTTP_RECORDS:
            return None

        # only http:/https: uris can have http headers
        if not uri.startswith(self.HTTP_SCHEMES):
            return None

        # request record: parse request
        if rec_type == 'request':
            return self.http_req_parser.parse(stream)

        elif rec_type == 'revisit':
            try:
                return self.http_parser.parse(stream)
            except EOFError:
                # empty revisit with no http headers, is ok!
                return None

        # response record or non-empty revisit: parse HTTP status and headers!
        else:
            return self.http_parser.parse(stream)

    def default_http_headers(self, length, content_type=None):
        """Synthesize a minimal '200 OK' HTTP header block (for replay)."""
        headers = []
        if content_type:
            headers.append(('Content-Type', content_type))

        if length is not None and length >= 0:
            headers.append(('Content-Length', str(length)))

        return StatusAndHeaders('200 OK', headers=headers, protocol='HTTP/1.0')

    def _detect_type_load_headers(self, stream,
                                  statusline=None, known_format=None):
        """If known_format is specified ('warc' or 'arc'),
        parse only as that format.

        Otherwise, try parsing record as WARC, then try parsing as ARC.
        if neither one succeeds, we're out of luck.
        """
        if known_format != 'arc':
            # try as warc first
            try:
                rec_headers = self.warc_parser.parse(stream, statusline)
                return 'warc', rec_headers
            except StatusAndHeadersParserException as se:
                if known_format == 'warc':
                    msg = 'Invalid WARC record, first line: '
                    raise ArchiveLoadFailed(msg + str(se.statusline))

                # fall through to ARC, reusing the already-read status line
                statusline = se.statusline
                pass

        # now try as arc
        try:
            rec_headers = self.arc_parser.parse(stream, statusline)
            return self.arc_parser.get_rec_type(), rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
                msg = 'Invalid ARC record, first line: '
            else:
                msg = 'Unknown archive format, first line: '
            raise ArchiveLoadFailed(msg + str(se.statusline))

    def _ensure_target_uri_format(self, rec_headers):
        """Checks the value for the WARC-Target-URI header field to see if
        it starts with '<' and ends with '>' (Wget 1.19 bug) and if '<' and '>'
        are present, corrects and updates the field returning the corrected
        value for the field otherwise just returns the fields value.

        Also checks for the presence of spaces and percent-encodes them
        if present, for more reliable parsing downstream.

        :param StatusAndHeaders rec_headers: The parsed WARC headers
        :return: The value for the WARC-Target-URI field
        :rtype: str | None
        """
        uri = rec_headers.get_header('WARC-Target-URI')
        if uri is not None and uri.startswith('<') and uri.endswith('>'):
            uri = uri[1:-1]
            rec_headers.replace_header('WARC-Target-URI', uri)

        if uri is not None and " " in uri:
            logger.warning(
                "Replacing spaces in invalid WARC-Target-URI: {}".format(uri))
            uri = uri.replace(" ", "%20")
            rec_headers.replace_header('WARC-Target-URI', uri)

        return uri
class WARCPathLoader(DefaultResolverMixin, BaseLoader):
    """Loads a WARC record for a CDX entry from local/resolved paths,
    returning (warc headers, optional http headers buffer, payload stream)."""

    def __init__(self, paths, cdx_source):
        self.paths = paths
        self.resolvers = self.make_resolvers(self.paths)
        # headers only: HTTP parsing is deferred to load_resource()
        self.resolve_loader = ResolvingLoader(self.resolvers,
                                              no_record_parse=True)
        self.headers_parser = StatusAndHeadersParser([], verify=False)
        self.cdx_source = cdx_source

    def load_resource(self, cdx, params):
        """Resolve and load the record for cdx; returns None when the entry
        has no filename/offset, or the cached result if one is set."""
        if cdx.get('_cached_result'):
            return cdx.get('_cached_result')

        if not cdx.get('filename') or cdx.get('offset') is None:
            return None

        orig_source = cdx.get('source', '').split(':')[0]
        formatter = ParamFormatter(params, orig_source)
        cdx._formatter = formatter

        def local_index_query(local_params):
            # forward caller 'param.*' params to the secondary cdx lookup
            for n, v in six.iteritems(params):
                if n.startswith('param.'):
                    local_params[n] = v

            cdx_iter, errs = self.cdx_source(local_params)
            for cdx in cdx_iter:
                cdx._formatter = formatter
                yield cdx

        failed_files = []
        headers, payload = (self.resolve_loader.
                            load_headers_and_payload(cdx,
                                                     failed_files,
                                                     local_index_query))

        http_headers_buff = None
        if payload.rec_type in ('response', 'revisit'):
            status = cdx.get('status')

            # status may not be set for 'revisit'
            if not status or status.startswith('3'):
                http_headers = self.headers_parser.parse(payload.raw_stream)

                try:
                    self.raise_on_self_redirect(params, cdx,
                                                http_headers.get_statuscode(),
                                                http_headers.get_header('Location'))
                except LiveResourceException:
                    headers.raw_stream.close()
                    payload.raw_stream.close()
                    raise

                http_headers_buff = http_headers.to_bytes()

        warc_headers = payload.rec_headers

        # headers and payload came from different records: carry the payload
        # record's URI/date over as Refers-To-* and use the headers record's own
        if headers != payload:
            warc_headers.replace_header('WARC-Refers-To-Target-URI',
                                        payload.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header('WARC-Refers-To-Date',
                                        payload.rec_headers.get_header('WARC-Date'))

            warc_headers.replace_header('WARC-Target-URI',
                                        headers.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header('WARC-Date',
                                        headers.rec_headers.get_header('WARC-Date'))

            headers.raw_stream.close()

        return (warc_headers, http_headers_buff, payload.raw_stream)
class WARCPathLoader(DefaultResolverMixin, BaseLoader):
    """Loads WARC records for CDX entries, resolving each entry's WARC
    filename against one or more configured archive paths.
    """

    def __init__(self, paths, cdx_source):
        self.paths = paths
        self.cdx_source = cdx_source
        self.resolvers = self.make_resolvers(self.paths)
        self.resolve_loader = ResolvingLoader(self.resolvers,
                                              no_record_parse=True)
        self.headers_parser = StatusAndHeadersParser([], verify=False)

    def load_resource(self, cdx, params):
        """Return ``(warc_headers, http_headers_buff, raw_stream)`` for the
        given CDX entry, or ``None`` if it has no filename/offset.
        """
        cached = cdx.get('_cached_result')
        if cached:
            return cached

        if not cdx.get('filename') or cdx.get('offset') is None:
            return None

        source_name = cdx.get('source', '').split(':')[0]
        param_formatter = ParamFormatter(params, source_name)
        cdx._formatter = param_formatter

        def secondary_query(local_params):
            # propagate 'param.'-prefixed values into the secondary lookup
            for key, value in six.iteritems(params):
                if key.startswith('param.'):
                    local_params[key] = value

            entries, errs = self.cdx_source(local_params)
            for entry in entries:
                entry._formatter = param_formatter
                yield entry

        failed_files = []
        headers, payload = self.resolve_loader.load_headers_and_payload(
            cdx, failed_files, secondary_query)

        http_headers_buff = None

        if payload.rec_type in ('response', 'revisit'):
            status = cdx.get('status')
            # status may not be set for 'revisit'; redirects (3xx) are
            # parsed so self-redirects can be detected and refused
            if not status or status.startswith('3'):
                parsed = self.headers_parser.parse(payload.raw_stream)

                try:
                    self.raise_on_self_redirect(params, cdx,
                                                parsed.get_statuscode(),
                                                parsed.get_header('Location'))
                except LiveResourceException:
                    # release both streams before re-raising
                    headers.raw_stream.close()
                    payload.raw_stream.close()
                    raise

                http_headers_buff = parsed.to_bytes()

        warc_headers = payload.rec_headers

        if headers != payload:
            # revisit resolved via a separate payload record: present the
            # requested record's URI/date, keep the payload's identity in
            # the WARC-Refers-To-* headers
            replacements = (
                ('WARC-Refers-To-Target-URI',
                 payload.rec_headers.get_header('WARC-Target-URI')),
                ('WARC-Refers-To-Date',
                 payload.rec_headers.get_header('WARC-Date')),
                ('WARC-Target-URI',
                 headers.rec_headers.get_header('WARC-Target-URI')),
                ('WARC-Date',
                 headers.rec_headers.get_header('WARC-Date')),
            )
            for name, value in replacements:
                warc_headers.replace_header(name, value)

            headers.raw_stream.close()

        return (warc_headers, http_headers_buff, payload.raw_stream)

    def __str__(self):
        return 'WARCPathLoader'
class ArcWarcRecordLoader(object):
    """Parses a single ARC or WARC record from a stream: detects the
    format, parses the record headers, optionally parses embedded HTTP
    headers, and limits the stream to the record's declared length.
    """

    # Accepted WARC version statuslines.
    WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']

    # Accepted HTTP response protocols.
    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']

    # Accepted HTTP request verbs (for request records).
    HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
                  'OPTIONS', 'CONNECT', 'PATCH']

    # Record types that never contain an HTTP header block.
    NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')

    # URI schemes that never carry HTTP headers vs. those that do.
    NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
    HTTP_SCHEMES = ('http:', 'https:')

    def __init__(self, verify_http=True, arc2warc=True):
        # arc2warc: when True, ARC headers are converted to WARC-style
        # headers during parsing.
        if arc2warc:
            self.arc_parser = ARC2WARCHeadersParser()
        else:
            self.arc_parser = ARCHeadersParser()

        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
        self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)

    def parse_record_stream(self, stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        Pass statusline and known_format to detect_type_loader_headers()
        to facilitate parsing.
        """
        (the_format, rec_headers) = (self._detect_type_load_headers(
            stream, statusline, known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            # ARC header line is included in the declared length and must
            # be subtracted below.
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            # NOTE(review): uri may be None if WARC-Target-URI is absent;
            # the elif branches below would then raise AttributeError on
            # uri.startswith() for response/revisit/request records --
            # confirm malformed-record handling with callers.
            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                # arc2warc: headers were converted, adjust length as for ARC.
                sub_len = rec_headers.total_len
                the_format = 'warc'

        is_err = False

        # Normalize the declared length; anything unparseable or negative
        # is treated as an error and forced to 0.
        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True

        except (ValueError, TypeError):
            is_err = True

        # err condition
        if is_err:
            length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        # don't parse the http record at all
        if no_record_parse:
            http_headers = None  #StatusAndHeaders('', [])

        # if empty record (error or otherwise) set status to 204
        elif length == 0:
            #if is_err:
            #    msg = '204 Possible Error'
            #else:
            #    msg = '204 No Content'
            http_headers = StatusAndHeaders('', [])

        # response record or non-empty revisit: parse HTTP status and headers!
        elif (rec_type in ('response', 'revisit')
              and uri.startswith(self.HTTP_SCHEMES)):
            http_headers = self.http_parser.parse(stream)

        # request record: parse request
        elif ((rec_type == 'request')
              and uri.startswith(self.HTTP_SCHEMES)):
            http_headers = self.http_req_parser.parse(stream)

        # everything else: create a no-status entry, set content-type
        else:
            content_type_header = [('Content-Type', content_type)]

            if length is not None and length >= 0:
                content_type_header.append(('Content-Length', str(length)))

            http_headers = StatusAndHeaders('200 OK', content_type_header)

        return ArcWarcRecord(the_format, rec_type,
                             rec_headers, stream, http_headers,
                             content_type, length)

    def _detect_type_load_headers(self, stream,
                                  statusline=None, known_format=None):
        """ If known_format is specified ('warc' or 'arc'),
        parse only as that format.

        Otherwise, try parsing record as WARC, then try parsing as ARC.
        if neither one succeeds, we're out of luck.
        """
        if known_format != 'arc':
            # try as warc first
            try:
                rec_headers = self.warc_parser.parse(stream, statusline)
                return 'warc', rec_headers
            except StatusAndHeadersParserException as se:
                if known_format == 'warc':
                    msg = 'Invalid WARC record, first line: '
                    raise ArchiveLoadFailed(msg + str(se.statusline))

                # carry the failed statusline over to the ARC attempt so no
                # input is lost.
                statusline = se.statusline
                pass

        # now try as arc
        try:
            rec_headers = self.arc_parser.parse(stream, statusline)
            return self.arc_parser.get_rec_type(), rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
                msg = 'Invalid ARC record, first line: '
            else:
                msg = 'Unknown archive format, first line: '
            raise ArchiveLoadFailed(msg + str(se.statusline))
class ArcWarcRecordLoader(object):
    """Parses a single ARC or WARC record from a stream: detects the
    format, parses the record headers, optionally parses embedded HTTP
    headers, and limits the stream to the record's declared length.
    """

    # Accepted WARC version statuslines.
    WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']

    # Accepted HTTP response protocols.
    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']

    # Accepted HTTP request verbs (for request records).
    HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
                  'OPTIONS', 'CONNECT', 'PATCH']

    # Record types that may contain an HTTP header block.
    HTTP_RECORDS = ('response', 'request', 'revisit')

    # URI schemes that never carry HTTP headers vs. those that do.
    NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
    HTTP_SCHEMES = ('http:', 'https:')

    def __init__(self, verify_http=True, arc2warc=True):
        # arc2warc: when True, ARC headers are converted to WARC-style
        # headers during parsing.
        if arc2warc:
            self.arc_parser = ARC2WARCHeadersParser()
        else:
            self.arc_parser = ARCHeadersParser()

        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
        self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)

    def parse_record_stream(self, stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False,
                            ensure_http_headers=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        Pass statusline and known_format to detect_type_loader_headers()
        to facilitate parsing.
        """
        (the_format, rec_headers) = (self._detect_type_load_headers(
            stream, statusline, known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            # ARC header line is included in the declared length and must
            # be subtracted below.
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                # arc2warc: headers were converted, adjust length as for ARC.
                sub_len = rec_headers.total_len
                the_format = 'warc'

        is_err = False

        # Normalize the declared length; anything unparseable or negative
        # is treated as an error and forced to 0.
        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True

        except (ValueError, TypeError):
            is_err = True

        # err condition
        if is_err:
            length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        http_headers = None

        # load http headers if parsing
        if not no_record_parse:
            http_headers = self.load_http_headers(rec_type, uri, stream, length)

        # ensure valid http headers are present (eg. for replay)
        if not http_headers and ensure_http_headers:
            http_headers = self.default_http_headers(length, content_type)

        return ArcWarcRecord(the_format, rec_type,
                             rec_headers, stream, http_headers,
                             content_type, length)

    def load_http_headers(self, rec_type, uri, stream, length):
        """Parse and return the HTTP headers embedded in the record, or
        None when the record cannot or does not contain any.
        """
        # only if length == 0 don't parse
        # try parsing if length is unknown (length is None) or length > 0
        if length == 0:
            return None

        # only certain record types can have http headers
        if rec_type not in self.HTTP_RECORDS:
            return None

        # only http:/https: uris can have http headers
        # (uri may be None on malformed records missing WARC-Target-URI;
        # treat that as non-http instead of raising AttributeError)
        if not uri or not uri.startswith(self.HTTP_SCHEMES):
            return None

        # request record: parse request
        if rec_type == 'request':
            return self.http_req_parser.parse(stream)

        # revisit: may legitimately contain no http headers at all
        if rec_type == 'revisit':
            try:
                return self.http_parser.parse(stream)
            except EOFError:
                # empty revisit with no http headers, is ok!
                return None

        # response record: parse HTTP status and headers!
        return self.http_parser.parse(stream)

    def default_http_headers(self, length, content_type=None):
        """Build a synthetic '200 OK' HTTP/1.0 header block carrying the
        record's content type and length, for records with no headers.
        """
        headers = []

        if content_type:
            headers.append(('Content-Type', content_type))

        if length is not None and length >= 0:
            headers.append(('Content-Length', str(length)))

        return StatusAndHeaders('200 OK', headers=headers, protocol='HTTP/1.0')

    def _detect_type_load_headers(self, stream,
                                  statusline=None, known_format=None):
        """ If known_format is specified ('warc' or 'arc'),
        parse only as that format.

        Otherwise, try parsing record as WARC, then try parsing as ARC.
        if neither one succeeds, we're out of luck.
        """
        if known_format != 'arc':
            # try as warc first
            try:
                rec_headers = self.warc_parser.parse(stream, statusline)
                return 'warc', rec_headers
            except StatusAndHeadersParserException as se:
                if known_format == 'warc':
                    msg = 'Invalid WARC record, first line: '
                    raise ArchiveLoadFailed(msg + str(se.statusline))

                # carry the failed statusline over to the ARC attempt so no
                # input is lost.
                statusline = se.statusline

        # now try as arc
        try:
            rec_headers = self.arc_parser.parse(stream, statusline)
            return self.arc_parser.get_rec_type(), rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
                msg = 'Invalid ARC record, first line: '
            else:
                msg = 'Unknown archive format, first line: '
            raise ArchiveLoadFailed(msg + str(se.statusline))