示例#1
0
    def _writeResponse(self, item, concurrentTo):
        """Write the WARC ``response`` record for one request item.

        WARC and HTTP headers are assembled from ``item.response``, the
        body (if any) is attached as payload and everything is passed to
        ``self.writeRecord``. If ``item.resourceType`` is ``'Document'`` the
        ``WARC-Record-ID`` of the new record is stored in
        ``self.documentRecords`` under ``item.url``.

        :param item: request/response item; this method reads its ``id``,
            ``response``, ``remoteIpAddress``, ``protocol``, ``url`` and
            ``resourceType`` attributes
        :param concurrentTo: value written to the ``WARC-Concurrent-To``
            header
        """
        resp = item.response
        warcHeaders = {
            'WARC-Concurrent-To': concurrentTo,
            'X-Chrome-Request-ID': item.id,
            'WARC-Date': datetime_to_iso_date(resp.timestamp),
        }
        # conditional WARC headers, only emitted when a value is available
        if item.remoteIpAddress:
            warcHeaders['WARC-IP-Address'] = item.remoteIpAddress
        if item.protocol:
            warcHeaders['X-Chrome-Protocol'] = item.protocol

        # HTTP headers; fall back to the standard reason phrase for the
        # status code if the response did not carry a status text
        statusText = resp.statusText or \
                BaseHTTPRequestHandler.responses.get(
                resp.status, ('No status text available', ))[0]
        httpHeaders = StatusAndHeaders(f'{resp.status} {statusText}',
                                       resp.headers,
                                       protocol='HTTP/1.1')

        # Content is saved decompressed and decoded, remove these headers
        blacklistedHeaders = {'transfer-encoding', 'content-encoding'}
        for h in blacklistedHeaders:
            httpHeaders.remove_header(h)

        # read the body once, it is needed for the content type as well as
        # for the payload
        body = resp.body

        # chrome sends nothing but utf8 encoded text. Fortunately HTTP
        # headers take precedence over the document's <meta>, thus we can
        # easily override those.
        contentType = resp.mimeType
        if contentType:
            if isinstance(body, UnicodeBody):
                contentType += '; charset=utf-8'
            httpHeaders.replace_header('Content-Type', contentType)

        # response body
        if body is None:
            # no body available, mark the record as truncated
            warcHeaders['WARC-Truncated'] = 'unspecified'
        else:
            # the stored payload may differ in size from what was sent over
            # the wire, so Content-Length is rewritten to match it
            httpHeaders.replace_header('Content-Length', str(len(body)))
            warcHeaders['X-Chrome-Base64Body'] = str(type(body) is Base64Body)
            body = BytesIO(body)

        record = self.writeRecord(item.url,
                                  'response',
                                  warc_headers_dict=warcHeaders,
                                  payload=body,
                                  http_headers=httpHeaders)

        # remember the record id of documents, keyed by their URL
        if item.resourceType == 'Document':
            self.documentRecords[item.url] = record.rec_headers.get_header(
                'WARC-Record-ID')
示例#2
0
    def _writeResponse(self, item, concurrentTo):
        """Write the WARC ``response`` record for one request item.

        The body is taken from ``item.body`` unless the item is a redirect
        or its ``encodedDataLength`` exceeds ``self.maxBodySize``; in those
        cases, and when ``item.body`` raises ``ValueError``, an empty
        payload is written and the record carries a ``WARC-Truncated``
        header. If ``item.resourceType`` is ``'Document'`` the
        ``WARC-Record-ID`` of the new record is stored in
        ``self.documentRecords`` under ``item.url``.

        :param item: request/response item; ``item.response`` is used as a
            mapping, ``item.body`` is unpacked into ``(rawBody,
            base64Encoded)``
        :param concurrentTo: value written to the ``WARC-Concurrent-To``
            header
        """
        # fetch the body
        reqId = item.id
        rawBody = None
        base64Encoded = False
        bodyTruncated = None
        if item.isRedirect:
            # redirects reuse the same request, thus we cannot safely retrieve
            # the body (i.e getResponseBody may return the new location's
            # body).
            bodyTruncated = 'unspecified'
        elif item.encodedDataLength > self.maxBodySize:
            bodyTruncated = 'length'
            # check body size first, since we're loading everything into memory
            self.logger.error('body for {} too large {} vs {}'.format(
                reqId, item.encodedDataLength, self.maxBodySize))
        else:
            try:
                rawBody, base64Encoded = item.body
            except ValueError:
                # oops, don't know what went wrong here
                bodyTruncated = 'unspecified'

        # now the response
        resp = item.response
        warcHeaders = {'WARC-Concurrent-To': concurrentTo}
        # WARC-IP-Address must hold an actual address, so the header is
        # only written if chrome reported one instead of being left empty.
        # It is inserted here to keep the header order unchanged.
        remoteIp = resp.get('remoteIPAddress')
        if remoteIp:
            warcHeaders['WARC-IP-Address'] = remoteIp
        warcHeaders.update({
            'X-Chrome-Protocol': resp.get('protocol', ''),
            'X-Chrome-FromDiskCache': str(resp.get('fromDiskCache')),
            'X-Chrome-ConnectionReused': str(resp.get('connectionReused')),
            'X-Chrome-Request-ID': item.id,
            # request wall time plus the difference between the response
            # and request timestamps
            'WARC-Date': datetime_to_iso_date(
                datetime.utcfromtimestamp(item.chromeRequest['wallTime'] +
                                          (item.chromeResponse['timestamp'] -
                                           item.chromeRequest['timestamp']))),
        })
        if bodyTruncated:
            warcHeaders['WARC-Truncated'] = bodyTruncated
        else:
            warcHeaders['X-Chrome-Base64Body'] = str(base64Encoded)

        httpHeaders = StatusAndHeaders('{} {}'.format(resp['status'],
                                                      item.statusText),
                                       item.responseHeaders,
                                       protocol='HTTP/1.1')

        # Content is saved decompressed and decoded, remove these headers
        blacklistedHeaders = {'transfer-encoding', 'content-encoding'}
        for h in blacklistedHeaders:
            httpHeaders.remove_header(h)

        # chrome sends nothing but utf8 encoded text. Fortunately HTTP
        # headers take precedence over the document's <meta>, thus we can
        # easily override those.
        contentType = resp.get('mimeType')
        if contentType:
            if not base64Encoded:
                contentType += '; charset=utf-8'
            httpHeaders.replace_header('content-type', contentType)

        if rawBody is not None:
            # the stored payload may differ in size from what was sent over
            # the wire, so content-length is rewritten to match it
            httpHeaders.replace_header('content-length',
                                       '{:d}'.format(len(rawBody)))
            bodyIo = BytesIO(rawBody)
        else:
            # no body available, write an empty payload
            bodyIo = BytesIO()

        record = self.writeRecord(resp['url'],
                                  'response',
                                  warc_headers_dict=warcHeaders,
                                  payload=bodyIo,
                                  http_headers=httpHeaders)

        # remember the record id of documents, keyed by their URL
        if item.resourceType == 'Document':
            self.documentRecords[item.url] = record.rec_headers.get_header(
                'WARC-Record-ID')