def __init__(self, paths, cdx_source):
    """Set up resolvers, the record loader and the HTTP header parser.

    :param paths: path specification handed to ``make_resolvers``.
    :param cdx_source: stored as-is on the instance for later index
        lookups.
    """
    self.paths = paths

    # Build the resolver list once and share it with the loader.
    resolvers = self.make_resolvers(paths)
    self.resolvers = resolvers
    self.resolve_loader = ResolvingLoader(resolvers, no_record_parse=True)

    # Parser is created with an empty status-line list and verify=False.
    self.headers_parser = StatusAndHeadersParser([], verify=False)

    self.cdx_source = cdx_source
def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False,
                       failed_files=None):
    """Load the record described by a CDX line and print a short summary.

    Prints the formatted headers followed by the first two lines of the
    returned stream.

    :param cdx: CDX line as text; it is UTF-8 encoded before parsing.
    :param revisit_func: callable passed through to the resolving loader.
    :param reraise: when true, ``ArchiveLoadFailed`` propagates to the
        caller; otherwise only the exception class name is printed.
    :param failed_files: passed through to the resolving loader.
    """
    resolvers = DefaultResolverMixin.make_resolvers(test_warc_dir)
    loader = ResolvingLoader(resolvers)
    cdx_obj = CDXObject(cdx.encode('utf-8'))

    try:
        headers, stream = loader(cdx_obj, failed_files, revisit_func)
        print(repr_format(headers))
        # Echo the first two lines of the loaded stream.
        for _ in range(2):
            sys.stdout.write(stream.readline().decode('utf-8'))
    except ArchiveLoadFailed as err:
        if not reraise:
            print('Exception: ' + err.__class__.__name__)
        else:
            raise
class WARCPathLoader(DefaultResolverMixin, BaseLoader):
    """Load WARC headers and payload for a CDX entry via path resolvers."""

    def __init__(self, paths, cdx_source):
        """Create resolvers for ``paths`` and keep ``cdx_source`` for lookups."""
        self.paths = paths

        resolvers = self.make_resolvers(paths)
        self.resolvers = resolvers
        self.resolve_loader = ResolvingLoader(resolvers, no_record_parse=True)

        self.headers_parser = StatusAndHeadersParser([], verify=False)
        self.cdx_source = cdx_source

    def load_resource(self, cdx, params):
        """Load the record referenced by ``cdx``.

        :param cdx: CDX entry; ``filename`` and ``offset`` must be present.
        :param params: request parameters; keys starting with ``param.``
            are copied into follow-up index queries.
        :returns: the truthy ``_cached_result`` if the entry has one,
            ``None`` if ``filename`` is missing/empty or ``offset`` is
            ``None``, otherwise a tuple
            ``(warc_headers, http_headers_buff, payload_stream)`` where
            ``http_headers_buff`` is ``None`` unless the HTTP headers were
            parsed for the self-redirect check.
        :raises LiveResourceException: re-raised from the self-redirect
            check after both raw streams have been closed.
        """
        cached = cdx.get('_cached_result')
        if cached:
            return cached

        if not cdx.get('filename') or cdx.get('offset') is None:
            return None

        source_name = cdx.get('source', '').split(':')[0]
        formatter = ParamFormatter(params, source_name)
        cdx._formatter = formatter

        def local_index_query(local_params):
            # Forward only the 'param.*' request parameters to the query.
            for key, value in six.iteritems(params):
                if key.startswith('param.'):
                    local_params[key] = value

            cdx_iter, _errs = self.cdx_source(local_params)
            for entry in cdx_iter:
                entry._formatter = formatter
                yield entry

        failed_files = []
        headers, payload = self.resolve_loader.load_headers_and_payload(
            cdx, failed_files, local_index_query)

        http_headers_buff = None
        if payload.rec_type in ('response', 'revisit'):
            status = cdx.get('status')
            # status may not be set for 'revisit'
            check_redirect = (not status) or status.startswith('3')
            if check_redirect:
                http_headers = self.headers_parser.parse(payload.raw_stream)
                try:
                    self.raise_on_self_redirect(
                        params, cdx,
                        http_headers.get_statuscode(),
                        http_headers.get_header('Location'))
                except LiveResourceException:
                    headers.raw_stream.close()
                    payload.raw_stream.close()
                    raise

                http_headers_buff = http_headers.to_bytes()

        warc_headers = payload.rec_headers

        if headers != payload:
            # Applied strictly in order: the payload's own target/date are
            # copied to the Refers-To fields before being overwritten with
            # the values from the headers record.
            header_moves = (
                ('WARC-Refers-To-Target-URI', payload, 'WARC-Target-URI'),
                ('WARC-Refers-To-Date', payload, 'WARC-Date'),
                ('WARC-Target-URI', headers, 'WARC-Target-URI'),
                ('WARC-Date', headers, 'WARC-Date'),
            )
            for dest_name, record, src_name in header_moves:
                warc_headers.replace_header(
                    dest_name, record.rec_headers.get_header(src_name))

            headers.raw_stream.close()

        return (warc_headers, http_headers_buff, payload.raw_stream)

    def __str__(self):
        return 'WARCPathLoader'
class WARCPathLoader(DefaultResolverMixin, BaseLoader):
    """Load WARC headers and payload for a CDX entry via path resolvers."""

    def __init__(self, paths, cdx_source):
        """Create resolvers for ``paths`` and keep ``cdx_source`` for lookups.

        :param paths: path specification handed to ``make_resolvers``.
        :param cdx_source: callable used by ``load_resource`` for follow-up
            index queries; called with a params dict and expected to
            return a ``(cdx_iter, errs)`` pair.
        """
        self.paths = paths
        self.resolvers = self.make_resolvers(self.paths)
        self.resolve_loader = ResolvingLoader(self.resolvers,
                                              no_record_parse=True)
        self.headers_parser = StatusAndHeadersParser([], verify=False)
        self.cdx_source = cdx_source

    def load_resource(self, cdx, params):
        """Load the record referenced by ``cdx``.

        :param cdx: CDX entry; ``filename`` and ``offset`` must be present.
        :param params: request parameters; keys starting with ``param.``
            are copied into follow-up index queries.
        :returns: the truthy ``_cached_result`` if the entry has one,
            ``None`` if ``filename`` is missing/empty or ``offset`` is
            ``None``, otherwise a tuple
            ``(warc_headers, http_headers_buff, payload_stream)`` where
            ``http_headers_buff`` is ``None`` unless the HTTP headers were
            parsed for the self-redirect check.
        :raises LiveResourceException: re-raised from the self-redirect
            check after both raw streams have been closed.
        """
        cached = cdx.get('_cached_result')
        if cached:
            return cached

        if not cdx.get('filename') or cdx.get('offset') is None:
            return None

        orig_source = cdx.get('source', '').split(':')[0]
        formatter = ParamFormatter(params, orig_source)
        cdx._formatter = formatter

        def local_index_query(local_params):
            # Forward only the 'param.*' request parameters to the query.
            for n, v in six.iteritems(params):
                if n.startswith('param.'):
                    local_params[n] = v

            cdx_iter, _errs = self.cdx_source(local_params)
            for found_cdx in cdx_iter:
                found_cdx._formatter = formatter
                yield found_cdx

        failed_files = []
        headers, payload = self.resolve_loader.load_headers_and_payload(
            cdx, failed_files, local_index_query)

        http_headers_buff = None
        if payload.rec_type in ('response', 'revisit'):
            status = cdx.get('status')

            # if status is not set, or is not 2xx, 4xx or 5xx,
            # go through self-redirect check just in case
            if not status or not status.startswith(('2', '4', '5')):
                http_headers = self.headers_parser.parse(payload.raw_stream)

                # Stream position right after the HTTP headers were read;
                # used below as the original header size (presumably the
                # stream starts at the beginning of the HTTP headers).
                # Streams that cannot report a position yield 0, which
                # disables the Content-Length adjustment.
                try:
                    orig_size = payload.raw_stream.tell()
                except Exception:
                    # Narrowed from a bare ``except:`` so that
                    # KeyboardInterrupt / SystemExit are not swallowed.
                    orig_size = 0

                try:
                    self.raise_on_self_redirect(
                        params, cdx,
                        http_headers.get_statuscode(),
                        http_headers.get_header('Location'))
                except LiveResourceException:
                    no_except_close(headers.raw_stream)
                    no_except_close(payload.raw_stream)
                    raise

                http_headers_buff = http_headers.to_bytes()

                # if new http_headers_buff is different length,
                # attempt to adjust content-length on the WARC record
                if orig_size and len(http_headers_buff) != orig_size:
                    orig_cl = payload.rec_headers.get_header('Content-Length')
                    if orig_cl:
                        new_cl = (int(orig_cl) +
                                  (len(http_headers_buff) - orig_size))
                        payload.rec_headers.replace_header(
                            'Content-Length', str(new_cl))

        warc_headers = payload.rec_headers

        if headers != payload:
            # Order matters: the payload's own target/date are copied to
            # the Refers-To fields before being overwritten with the
            # values from the headers record.
            warc_headers.replace_header(
                'WARC-Refers-To-Target-URI',
                payload.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header(
                'WARC-Refers-To-Date',
                payload.rec_headers.get_header('WARC-Date'))

            warc_headers.replace_header(
                'WARC-Target-URI',
                headers.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header(
                'WARC-Date',
                headers.rec_headers.get_header('WARC-Date'))

            no_except_close(headers.raw_stream)

        return (warc_headers, http_headers_buff, payload.raw_stream)

    def __str__(self):
        return 'WARCPathLoader'