예제 #1
0
class WARCPathLoader(DefaultResolverMixin, BaseLoader):
    """Load the WARC record referenced by a CDX entry via path resolvers.

    The loader builds its resolvers from ``paths`` and uses ``cdx_source``
    for any follow-up index lookups requested while loading a record.
    """

    def __init__(self, paths, cdx_source):
        """Store the configuration and build the resolver/loader helpers.

        :param paths: value handed to ``make_resolvers`` to build resolvers.
        :param cdx_source: callable taking a params mapping and returning a
            ``(cdx_iterator, errors)`` pair.
        """
        self.paths = paths
        self.resolvers = self.make_resolvers(self.paths)
        self.resolve_loader = ResolvingLoader(
            self.resolvers, no_record_parse=True)
        self.headers_parser = StatusAndHeadersParser([], verify=False)
        self.cdx_source = cdx_source

    def load_resource(self, cdx, params):
        """Load the record for ``cdx`` and return its headers and stream.

        :param cdx: CDX entry; must provide ``filename`` and ``offset``.
        :param params: request parameters; keys starting with ``param.``
            are forwarded to follow-up index queries.
        :returns: the truthy ``_cached_result`` of ``cdx`` if present,
            ``None`` when ``filename`` is missing/empty or ``offset`` is
            ``None``, otherwise a tuple
            ``(warc_headers, http_headers_buff, payload_stream)`` where
            ``http_headers_buff`` is ``None`` unless the HTTP headers were
            parsed for the self-redirect check.
        :raises LiveResourceException: propagated from
            ``raise_on_self_redirect`` after both raw streams are closed.
        """
        cached = cdx.get('_cached_result')
        if cached:
            return cached

        has_location = cdx.get('filename') and cdx.get('offset') is not None
        if not has_location:
            return None

        source_name = cdx.get('source', '').partition(':')[0]
        formatter = ParamFormatter(params, source_name)
        cdx._formatter = formatter

        def local_index_query(local_params):
            # Forward only the 'param.'-prefixed request parameters.
            for key, value in six.iteritems(params):
                if key.startswith('param.'):
                    local_params[key] = value

            results, _errs = self.cdx_source(local_params)
            for entry in results:
                # Every yielded entry shares the formatter of the request.
                entry._formatter = formatter
                yield entry

        failed_files = []
        headers, payload = self.resolve_loader.load_headers_and_payload(
            cdx, failed_files, local_index_query)

        http_headers_buff = None

        check_redirect = payload.rec_type in ('response', 'revisit')
        if check_redirect:
            status = cdx.get('status')
            # status may not be set for 'revisit'
            check_redirect = not status or status.startswith('3')

        if check_redirect:
            http_headers = self.headers_parser.parse(payload.raw_stream)

            try:
                self.raise_on_self_redirect(
                    params,
                    cdx,
                    http_headers.get_statuscode(),
                    http_headers.get_header('Location'))
            except LiveResourceException:
                # Release both streams before propagating the error.
                for record in (headers, payload):
                    record.raw_stream.close()
                raise

            http_headers_buff = http_headers.to_bytes()

        warc_headers = payload.rec_headers

        if headers != payload:
            # (destination header, source headers, source header name);
            # applied strictly in this order because ``warc_headers`` is the
            # payload's own header object: the payload values are copied to
            # the Refers-To fields before being overwritten.
            rewrites = (
                ('WARC-Refers-To-Target-URI',
                 payload.rec_headers, 'WARC-Target-URI'),
                ('WARC-Refers-To-Date',
                 payload.rec_headers, 'WARC-Date'),
                ('WARC-Target-URI',
                 headers.rec_headers, 'WARC-Target-URI'),
                ('WARC-Date',
                 headers.rec_headers, 'WARC-Date'),
            )
            for dest_name, source_headers, source_name in rewrites:
                warc_headers.replace_header(
                    dest_name, source_headers.get_header(source_name))

            headers.raw_stream.close()

        return (warc_headers, http_headers_buff, payload.raw_stream)

    def __str__(self):
        """Return the fixed display name of this loader."""
        return 'WARCPathLoader'
예제 #2
0
class WARCPathLoader(DefaultResolverMixin, BaseLoader):
    """Load the WARC record referenced by a CDX entry via path resolvers.

    The loader builds its resolvers from ``paths`` and uses ``cdx_source``
    for any follow-up index lookups requested while loading a record.
    """

    def __init__(self, paths, cdx_source):
        """Store the configuration and build the resolver/loader helpers.

        :param paths: value handed to ``make_resolvers`` to build resolvers.
        :param cdx_source: callable taking a params mapping and returning a
            ``(cdx_iterator, errors)`` pair.
        """
        self.paths = paths

        self.resolvers = self.make_resolvers(self.paths)

        self.resolve_loader = ResolvingLoader(self.resolvers,
                                              no_record_parse=True)

        self.headers_parser = StatusAndHeadersParser([], verify=False)

        self.cdx_source = cdx_source

    def load_resource(self, cdx, params):
        """Load the record for ``cdx`` and return its headers and stream.

        :param cdx: CDX entry; must provide ``filename`` and ``offset``.
        :param params: request parameters; keys starting with ``param.``
            are forwarded to follow-up index queries.
        :returns: the truthy ``_cached_result`` of ``cdx`` if present,
            ``None`` when ``filename`` is missing/empty or ``offset`` is
            ``None``, otherwise a tuple
            ``(warc_headers, http_headers_buff, payload_stream)`` where
            ``http_headers_buff`` is ``None`` unless the HTTP headers were
            parsed for the self-redirect check.
        :raises LiveResourceException: propagated from
            ``raise_on_self_redirect`` after both raw streams are closed.
        """
        if cdx.get('_cached_result'):
            return cdx.get('_cached_result')

        if not cdx.get('filename') or cdx.get('offset') is None:
            return None

        orig_source = cdx.get('source', '').split(':')[0]
        formatter = ParamFormatter(params, orig_source)
        cdx._formatter = formatter

        def local_index_query(local_params):
            # Forward only the 'param.'-prefixed request parameters.
            for n, v in six.iteritems(params):
                if n.startswith('param.'):
                    local_params[n] = v

            # The error part of the result is not used here.
            cdx_iter, _errs = self.cdx_source(local_params)
            for cdx in cdx_iter:
                cdx._formatter = formatter
                yield cdx

        failed_files = []
        headers, payload = (self.resolve_loader.load_headers_and_payload(
            cdx, failed_files, local_index_query))

        http_headers_buff = None
        if payload.rec_type in ('response', 'revisit'):
            status = cdx.get('status')

            # If the status is not set, or is anything other than
            # 2xx, 4xx or 5xx, run the self-redirect check just in case.
            if not status or not status.startswith(('2', '4', '5')):
                http_headers = self.headers_parser.parse(payload.raw_stream)
                # Stream position after parsing; presumably the byte length
                # of the original HTTP header block (assumes the stream
                # started at offset 0 -- TODO confirm).
                try:
                    orig_size = payload.raw_stream.tell()
                except Exception:
                    # Best effort: a stream without a usable tell() simply
                    # skips the Content-Length adjustment below. Catching
                    # Exception (not a bare except) lets KeyboardInterrupt
                    # and SystemExit propagate.
                    orig_size = 0

                try:
                    self.raise_on_self_redirect(
                        params, cdx, http_headers.get_statuscode(),
                        http_headers.get_header('Location'))
                except LiveResourceException:
                    # Release both streams before propagating the error.
                    no_except_close(headers.raw_stream)
                    no_except_close(payload.raw_stream)
                    raise

                http_headers_buff = http_headers.to_bytes()

                # if new http_headers_buff is different length,
                # attempt to adjust content-length on the WARC record
                if orig_size and len(http_headers_buff) != orig_size:
                    orig_cl = payload.rec_headers.get_header('Content-Length')
                    if orig_cl:
                        new_cl = int(orig_cl) + (len(http_headers_buff) -
                                                 orig_size)
                        payload.rec_headers.replace_header(
                            'Content-Length', str(new_cl))

        warc_headers = payload.rec_headers

        if headers != payload:
            # ``warc_headers`` is the payload's own header object, so the
            # payload's URI/date are copied into the Refers-To fields first
            # and only then overwritten with the values from ``headers``.
            warc_headers.replace_header(
                'WARC-Refers-To-Target-URI',
                payload.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header(
                'WARC-Refers-To-Date',
                payload.rec_headers.get_header('WARC-Date'))

            warc_headers.replace_header(
                'WARC-Target-URI',
                headers.rec_headers.get_header('WARC-Target-URI'))

            warc_headers.replace_header(
                'WARC-Date', headers.rec_headers.get_header('WARC-Date'))
            no_except_close(headers.raw_stream)

        return (warc_headers, http_headers_buff, payload.raw_stream)

    def __str__(self):
        """Return the fixed display name of this loader."""
        return 'WARCPathLoader'