def _writeRequest(self, item):
    """
    Write the WARC request record for ``item``.

    The request line is built from the item's relative URL with the fragment
    removed. If the request claims to carry POST data but no body was
    captured, the record is flagged with ``WARC-Truncated: unspecified`` and
    written without a payload.

    :param item: request/response pair providing ``id``, ``url`` and ``request``
    :return: value of the written record's WARC-Record-ID header
    """
    log = self.logger.bind(reqId=item.id)
    request = item.request
    targetUrl = item.url

    requestPath = targetUrl.relative().with_fragment(None)
    requestLine = f'{request.method} {requestPath} HTTP/1.1'
    httpHeaders = StatusAndHeaders(requestLine, request.headers,
                                   protocol='HTTP/1.1', is_http_request=True)

    warcHeaders = {
        'X-Chrome-Initiator': json.dumps(request.initiator),
        'X-Chrome-Request-ID': item.id,
        'WARC-Date': datetime_to_iso_date(request.timestamp),
    }

    payload = request.body
    bodyMissing = request.hasPostData and payload is None
    if bodyMissing:
        # POST data was announced but never captured; cause unknown
        log.error('requestBody missing',
                  uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
        warcHeaders['WARC-Truncated'] = 'unspecified'
    else:
        # record whether the captured body is a Base64Body
        warcHeaders['X-Chrome-Base64Body'] = str(type(payload) is Base64Body)
        payload = BytesIO(payload)

    written = self.writeRecord(targetUrl, 'request', payload=payload,
                               http_headers=httpHeaders,
                               warc_headers_dict=warcHeaders)
    return written.rec_headers['WARC-Record-ID']
def load_resource(self, cdx, params):
    """
    Build a WARC ``metadata`` record holding the info extracted by
    ``self.ydl`` for ``cdx['load_url']``.

    :param cdx: index entry; ``load_url`` and ``timestamp`` are read
    :param params: request parameters; only ``content_type`` is read
    :return: ``None`` when there is no load url, the requested content type
        is not ``self.CONTENT_TYPE`` or no extractor is configured; otherwise
        a ``(warc_headers, None, payload_stream)`` tuple
    """
    load_url = cdx.get('load_url')
    if (not load_url
            or params.get('content_type') != self.CONTENT_TYPE
            or not self.ydl):
        return None

    # JSON-serialise the extracted info; this is the record payload
    payload = json.dumps(self.ydl.extract_info(load_url)).encode('utf-8')

    # keep everything after the scheme and re-home it under metadata://
    _scheme, remainder = load_url.split('://', 1)
    capture_dt = timestamp_to_datetime(cdx['timestamp'])

    header_fields = {
        'WARC-Type': 'metadata',
        'WARC-Record-ID': self._make_warc_id(),
        'WARC-Target-URI': 'metadata://' + remainder,
        'WARC-Date': datetime_to_iso_date(capture_dt),
        'Content-Type': self.CONTENT_TYPE,
        'Content-Length': str(len(payload)),
    }

    record_headers = StatusAndHeaders('WARC/1.0', header_fields.items())
    return record_headers, None, BytesIO(payload)
def _writeResponse(self, item, concurrentTo):
    """
    Write the WARC response record for ``item``.

    The body is stored decompressed and decoded, so the transfer-encoding
    and content-encoding headers are dropped and Content-Length is rewritten
    to the stored body's length. A missing body is flagged with
    ``WARC-Truncated: unspecified``.

    For items whose ``resourceType`` is ``'Document'`` the record id is
    remembered in ``self.documentRecords`` under the item's URL.

    :param item: request/response pair; ``item.response`` supplies status,
        headers, MIME type, timestamp and body
    :param concurrentTo: value stored in the WARC-Concurrent-To header
    """
    resp = item.response

    warcHeaders = {
        'WARC-Concurrent-To': concurrentTo,
        'X-Chrome-Request-ID': item.id,
        'WARC-Date': datetime_to_iso_date(resp.timestamp),
    }
    # conditional WARC headers
    if item.remoteIpAddress:
        warcHeaders['WARC-IP-Address'] = item.remoteIpAddress
    if item.protocol:
        warcHeaders['X-Chrome-Protocol'] = item.protocol

    # HTTP headers; fall back to the standard reason phrase for the status
    statusText = resp.statusText or \
        BaseHTTPRequestHandler.responses.get(
            resp.status, ('No status text available', ))[0]
    httpHeaders = StatusAndHeaders(f'{resp.status} {statusText}',
                                   resp.headers, protocol='HTTP/1.1')

    # Content is saved decompressed and decoded, remove these headers
    for name in ('transfer-encoding', 'content-encoding'):
        httpHeaders.remove_header(name)

    # chrome sends nothing but utf8 encoded text. Fortunately HTTP
    # headers take precedence over the document's <meta>, thus we can
    # easily override those.
    contentType = resp.mimeType
    if contentType:
        if isinstance(resp.body, UnicodeBody):
            contentType += '; charset=utf-8'
        httpHeaders.replace_header('Content-Type', contentType)

    # response body
    body = resp.body
    if body is None:
        warcHeaders['WARC-Truncated'] = 'unspecified'
    else:
        httpHeaders.replace_header('Content-Length', str(len(body)))
        warcHeaders['X-Chrome-Base64Body'] = str(type(body) is Base64Body)
        body = BytesIO(body)

    record = self.writeRecord(item.url, 'response',
                              warc_headers_dict=warcHeaders, payload=body,
                              http_headers=httpHeaders)

    if item.resourceType == 'Document':
        self.documentRecords[item.url] = record.rec_headers.get_header(
            'WARC-Record-ID')
def _writeRequest(self, item):
    """
    Write the WARC request record for ``item``.

    The request line uses the path and query of the *response* URL, while
    the record itself is filed under the request URL. If reading the request
    body raises ``ValueError`` the record is flagged with
    ``WARC-Truncated: unspecified`` and written without a payload.

    :param item: request/response pair (dict-style request and response)
    :return: value of the written record's WARC-Record-ID header
    """
    log = self.logger.bind(reqId=item.id)
    request = item.request
    response = item.response

    parts = urlsplit(response['url'])
    requestPath = (parts.path + '?' + parts.query) if parts.query \
        else parts.path
    requestLine = '{} {} HTTP/1.1'.format(request['method'], requestPath)
    httpHeaders = StatusAndHeaders(requestLine, item.requestHeaders,
                                   protocol='HTTP/1.1', is_http_request=True)

    warcHeaders = {
        'X-Chrome-Initiator': json.dumps(item.initiator),
        'X-Chrome-Request-ID': item.id,
        'WARC-Date': datetime_to_iso_date(
            datetime.utcfromtimestamp(item.chromeRequest['wallTime'])),
    }

    try:
        payload, isBase64 = item.requestBody
    except ValueError:
        # body could not be retrieved; cause unknown
        log.error('requestBody missing',
                  uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
        warcHeaders['WARC-Truncated'] = 'unspecified'
        payload = None

    if payload:
        payload = BytesIO(payload)
        warcHeaders['X-Chrome-Base64Body'] = str(isBase64)

    written = self.writeRecord(request['url'], 'request', payload=payload,
                               http_headers=httpHeaders,
                               warc_headers_dict=warcHeaders)
    return written.rec_headers['WARC-Record-ID']
def write_warc_info(self, message):
    """
    Write a warcinfo record describing the MHTML snapshot in ``message``.

    When the message's ``Date`` header can be converted, it replaces the
    record's WARC-Date and the date the record was created with is moved to
    WARC-Creation-Date.

    :param message: parsed message; the ``From``,
        ``Snapshot-Content-Location``, ``Subject`` and ``Date`` headers
        are read
    :return: the snapshot date as formatted by ``datetime_to_iso_date``, or
        ``''`` when the ``Date`` header could not be converted
    """
    creator = message.get('From', '')
    url = message.get('Snapshot-Content-Location', '')
    title = message.get('Subject', url)

    # best effort: any failure to convert the date leaves both values empty
    try:
        snapshot_date = http_date_to_datetime(message['Date'])
        timestamp = datetime_to_timestamp(snapshot_date)
    except Exception:
        snapshot_date = ''
        timestamp = ''

    source = 'MHTML Snapshot for: ' + url
    software = 'mhtml2warc ' + str(__version__)

    page = {'title': title, 'url': url, 'timestamp': timestamp}
    metadata = {'title': source, 'type': 'recording', 'pages': [page]}

    params = OrderedDict()
    params['software'] = software
    params['creator'] = creator
    params['source'] = source
    params['format'] = 'WARC File Format 1.0'
    params['subject'] = title
    params['json-metadata'] = json.dumps(metadata)

    record = self.writer.create_warcinfo_record(self.filename, params)

    if snapshot_date:
        headers = record.rec_headers
        snapshot_date = datetime_to_iso_date(snapshot_date)
        created = headers.get('WARC-Date')
        headers.replace_header('WARC-Date', snapshot_date)
        headers.replace_header('WARC-Creation-Date', created)

    self.writer.write_record(record)
    return snapshot_date
def make_record(self, writer, file_info):
    """
    Write one ``resource`` record for ``file_info`` and, when the file is a
    directory index, an additional index revisit record.

    The record date is ``self.fixed_dt`` when set, otherwise the file's
    modification time. The time of writing is stored in the
    ``WARC-Creation-Date`` header.

    :param writer: WARC writer used to create and write the record
    :param file_info: file description providing ``url``, ``full_filename``,
        ``modified_dt``, ``size`` and ``open()``
    """
    if self.fixed_dt:
        warc_date = self.fixed_dt
    else:
        warc_date = datetime_to_iso_date(file_info.modified_dt)

    url = file_info.url

    source_uri = 'file://' + file_info.full_filename

    warc_headers = {
        'WARC-Date': warc_date,
        'WARC-Source-URI': source_uri,
        # header name is WARC-Creation-Date, not WARC-Created-Date
        'WARC-Creation-Date': writer._make_warc_date(),
    }

    # content type is the guessed mime type plus the guessed charset suffix
    warc_content_type = self._guess_type(file_info)
    warc_content_type += self._guess_charset(warc_content_type, file_info)

    with file_info.open() as fh:
        record = writer.create_warc_record(
            url, 'resource',
            payload=fh,
            length=file_info.size,
            warc_content_type=warc_content_type,
            warc_headers_dict=warc_headers)

        self.count += 1
        writer.write_record(record)

        self.logger.debug('Writing "{0}" ({1}) @ "{2}" from "{3}"'.format(
            url, warc_content_type, warc_date, file_info.full_filename))

    # the current file serves as a directory index
    if url.lower().endswith(self.index_files):
        self.add_index_revisit(writer, record, url, warc_date, source_uri)
def make_record(self, writer, file_info, record_type='resource', extra_headers=None):
    """
    Write one WARC record for ``file_info`` unless the include/exclude
    rules reject the file.

    :param writer: WARC writer used to create and write the record
    :param file_info: file description providing ``url``, ``full_filename``,
        ``modified_dt``, ``size`` and ``open()``
    :param record_type: WARC record type to create
    :param extra_headers: optional dict merged over the default WARC headers
    :return: ``False`` when the file is filtered out, otherwise a
        ``(url, record)`` tuple
    """
    filename = file_info.full_filename

    # process include/exclude rules; with both lists set, a file is dropped
    # only if it matches the exclude list without matching the include list
    if self.include and self.exclude:
        if (not self.fnmatch_list(filename, self.include)
                and self.fnmatch_list(filename, self.exclude)):
            return False
    elif self.include:
        if not self.fnmatch_list(filename, self.include):
            return False
    elif self.exclude:
        if self.fnmatch_list(filename, self.exclude):
            return False

    # type and encoding
    if self.use_tika:
        file_info.tika_results = self.tika_parser.from_file(filename)
    if self.use_mapfile:
        file_info.mapfile_results = self._match_mapfile(filename)

    mime_type = self._guess_type(file_info)
    encoding = self._guess_charset(mime_type, file_info)
    warc_content_type = mime_type + encoding

    # mapfile entry for this file, if the mapfile is in use
    mapped = file_info.mapfile_results if self.use_mapfile else None

    # target URL
    if mapped and 'URL' in mapped:
        url = mapped['URL']
    else:
        url = file_info.url

    # timestamp: mapfile entry, then fixed date, then file modification time
    if mapped and 'timestamp' in mapped:
        warc_date = self._set_fixed_dt(mapped['timestamp'])
    elif self.fixed_dt:
        warc_date = self.fixed_dt
    else:
        warc_date = datetime_to_iso_date(file_info.modified_dt)

    # source from local disk
    source_uri = 'file://' + filename

    # write WARC entry
    warc_headers = {
        'WARC-Date': warc_date,
        'WARC-Source-URI': source_uri,
        'WARC-Creation-Date': writer._make_warc_date(),
    }
    if extra_headers:
        warc_headers.update(extra_headers)

    with file_info.open() as fh:
        record = writer.create_warc_record(
            url, record_type,
            payload=fh,
            length=file_info.size,
            warc_content_type=warc_content_type,
            warc_headers_dict=warc_headers)

        self.count += 1
        writer.write_record(record)

        self.logger.debug('Writing "{0}" ({1}) @ "{2}" from "{3}"'.format(
            url, warc_content_type, warc_date, filename))

        log_entry = {
            'file': filename,
            'Record-Type': record_type,
            'URL': url,
            'timestamp': warc_date,
            'Content-Type': warc_content_type,
            'mime': mime_type,
            'charset': encoding[10:],  # strip the leading '; charset='
        }
        self.write_logfile(log_entry)

    return url, record
def _make_warc_date(cls):
    """Return the current UTC time formatted by ``datetime_to_iso_date``."""
    utc_now = datetime.datetime.utcnow()
    return datetime_to_iso_date(utc_now)
def _make_warc_date(cls, use_micros=False):
    """
    Return the current UTC time formatted by ``datetime_to_iso_date``.

    :param use_micros: forwarded unchanged to ``datetime_to_iso_date``
    """
    utc_now = datetime.datetime.utcnow()
    return datetime_to_iso_date(utc_now, use_micros=use_micros)
def load_resource(self, cdx, params):
    """
    Fetch ``cdx['load_url']`` from the upstream server and wrap the response
    as a WARC ``response`` record.

    :param cdx: index entry; ``load_url``, ``timestamp``, ``url`` and the
        optional ``is_live``, ``memento_url`` and ``set_referrer`` keys are
        read, and ``timestamp`` / ``source`` may be updated
    :param params: request parameters; ``content_type`` and ``_input_req``
        are read
    :return: ``None`` when there is no load url, when the video content type
        is requested, or when a memento was requested but upstream sent no
        ``Memento-Datetime`` header. ``(None, upstream headers, upstream
        response)`` when upstream reports ``Warcserver-Type: warc``.
        Otherwise ``(warc_headers, http_headers_buff, upstream_res)``.
    :raises LiveResourceException: if the load url cannot be prepared
    """
    load_url = cdx.get('load_url')
    if not load_url:
        return None

    if params.get('content_type') == VideoLoader.CONTENT_TYPE:
        return None

    if self.forward_proxy_prefix and not cdx.get('is_live'):
        load_url = self.forward_proxy_prefix + load_url

    input_req = params['_input_req']

    req_headers = input_req.get_req_headers()

    dt = timestamp_to_datetime(cdx['timestamp'])

    if cdx.get('memento_url'):
        req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

    method = input_req.get_req_method()
    data = input_req.get_req_body()

    # let requests normalise the url and derive auth from it
    p = PreparedRequest()
    try:
        p.prepare_url(load_url, None)
    except Exception:
        raise LiveResourceException(load_url)
    p.prepare_headers(None)
    p.prepare_auth(None, load_url)

    auth = p.headers.get('Authorization')
    if auth:
        req_headers['Authorization'] = auth

    load_url = p.url

    # host is set to the actual host for live loading
    # ensure it is set to the load_url host
    if not cdx.get('is_live'):
        req_headers['Host'] = urlsplit(p.url).netloc

        referrer = cdx.get('set_referrer')
        if referrer:
            req_headers['Referer'] = referrer

    upstream_res = self._do_request_with_redir_check(
        method, load_url, data, req_headers, params, cdx)

    memento_dt = upstream_res.headers.get('Memento-Datetime')
    if memento_dt:
        dt = http_date_to_datetime(memento_dt)
        cdx['timestamp'] = datetime_to_timestamp(dt)
    elif cdx.get('memento_url'):
        # if 'memento_url' set and no Memento-Datetime header present
        # then its an error
        return None

    agg_type = upstream_res.headers.get('Warcserver-Type')
    if agg_type == 'warc':
        # upstream already serves a WARC record; pass it through as-is
        cdx['source'] = unquote(
            upstream_res.headers.get('Warcserver-Source-Coll'))
        return None, upstream_res.headers, upstream_res

    if upstream_res.version == 11:
        version = '1.1'
    else:
        version = '1.0'

    status = 'HTTP/{version} {status} {reason}\r\n'
    status = status.format(version=version,
                           status=upstream_res.status,
                           reason=upstream_res.reason)

    http_headers_buff = status

    orig_resp = upstream_res._original_response

    try:  #pragma: no cover
        # PY 3
        resp_headers = orig_resp.headers._headers
        for n, v in resp_headers:
            nl = n.lower()
            if nl in self.SKIP_HEADERS:
                continue

            if nl in self.UNREWRITE_HEADERS:
                v = self.unrewrite_header(cdx, v)

            http_headers_buff += n + ': ' + v + '\r\n'

        http_headers_buff += '\r\n'

        try:
            # http headers could be encoded as utf-8 (though non-standard)
            # first try utf-8 encoding
            http_headers_buff = http_headers_buff.encode('utf-8')
        except Exception:
            # then, fall back to latin-1
            http_headers_buff = http_headers_buff.encode('latin-1')

    except Exception:  #pragma: no cover
        # PY 2
        resp_headers = orig_resp.msg.headers

        for line in resp_headers:
            n, v = line.split(':', 1)
            n = n.lower()
            v = v.strip()

            if n in self.SKIP_HEADERS:
                continue

            new_v = v
            if n in self.UNREWRITE_HEADERS:
                new_v = self.unrewrite_header(cdx, v)

            if new_v != v:
                http_headers_buff += n + ': ' + new_v + '\r\n'
            else:
                http_headers_buff += line

        # if python2, already byte headers, so leave as is
        http_headers_buff += '\r\n'

    # best effort: peer address from the underlying socket, if reachable
    try:
        fp = upstream_res._fp.fp
        if hasattr(fp, 'raw'):  #pragma: no cover
            fp = fp.raw
        remote_ip = fp._sock.getpeername()[0]
    except Exception:  #pragma: no cover
        remote_ip = None

    warc_headers = {}

    warc_headers['WARC-Type'] = 'response'
    warc_headers['WARC-Record-ID'] = self._make_warc_id()
    warc_headers['WARC-Target-URI'] = cdx['url']
    warc_headers['WARC-Date'] = datetime_to_iso_date(dt)

    if not cdx.get('is_live'):
        now = datetime.datetime.utcnow()
        warc_headers['WARC-Source-URI'] = cdx.get('load_url')
        warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now)

    if remote_ip:
        warc_headers['WARC-IP-Address'] = remote_ip

    ct = upstream_res.headers.get('Content-Type')
    if ct:
        metadata = self.get_custom_metadata(ct, dt)
        if metadata:
            warc_headers['WARC-JSON-Metadata'] = json.dumps(metadata)

    warc_headers['Content-Type'] = 'application/http; msgtype=response'

    if method == 'HEAD':
        content_len = 0
    else:
        content_len = upstream_res.headers.get('Content-Length', -1)

    self._set_content_len(content_len,
                          warc_headers,
                          len(http_headers_buff))

    warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
    return (warc_headers, http_headers_buff, upstream_res)
def __call__(self, cdx, params):
    """
    Return a ``('revisit', url, date)`` tuple for the cdx entry.

    ``date`` is ``cdx['timestamp']`` converted with
    ``timestamp_to_datetime`` and ``datetime_to_iso_date``; ``params`` is
    not used.
    """
    capture_dt = timestamp_to_datetime(cdx['timestamp'])
    target_url = cdx['url']
    return ('revisit', target_url, datetime_to_iso_date(capture_dt))
def load_resource(self, cdx, params):
    """
    Fetch ``cdx['load_url']`` from the upstream server and wrap the response
    as a WARC ``response`` record.

    :param cdx: index entry; ``load_url``, ``timestamp``, ``url`` and the
        optional ``is_live``, ``memento_url`` and ``set_referrer`` keys are
        read, and ``timestamp`` / ``source`` may be updated
    :param params: request parameters; ``content_type`` and ``_input_req``
        are read
    :return: ``None`` when there is no load url, when the video content type
        is requested, or when a memento was requested but upstream sent no
        ``Memento-Datetime`` header. ``(None, upstream headers, upstream
        response)`` when upstream reports ``Warcserver-Type: warc``.
        Otherwise ``(warc_headers, http_headers_buff, upstream_res)``.
    :raises LiveResourceException: if the load url cannot be prepared
    """
    load_url = cdx.get('load_url')
    if not load_url:
        return None

    if params.get('content_type') == VideoLoader.CONTENT_TYPE:
        return None

    if self.forward_proxy_prefix and not cdx.get('is_live'):
        load_url = self.forward_proxy_prefix + load_url

    input_req = params['_input_req']

    req_headers = input_req.get_req_headers()

    dt = timestamp_to_datetime(cdx['timestamp'])

    if cdx.get('memento_url'):
        req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

    method = input_req.get_req_method()
    data = input_req.get_req_body()

    # let requests normalise the url and derive auth from it
    p = PreparedRequest()
    try:
        p.prepare_url(load_url, None)
    except Exception:
        raise LiveResourceException(load_url)
    p.prepare_headers(None)
    p.prepare_auth(None, load_url)

    auth = p.headers.get('Authorization')
    if auth:
        req_headers['Authorization'] = auth

    load_url = p.url

    # host is set to the actual host for live loading
    # ensure it is set to the load_url host
    if not cdx.get('is_live'):
        req_headers['Host'] = urlsplit(p.url).netloc

        referrer = cdx.get('set_referrer')
        if referrer:
            req_headers['Referer'] = referrer

    upstream_res = self._do_request_with_redir_check(method, load_url,
                                                     data, req_headers,
                                                     params, cdx)

    memento_dt = upstream_res.headers.get('Memento-Datetime')
    if memento_dt:
        dt = http_date_to_datetime(memento_dt)
        cdx['timestamp'] = datetime_to_timestamp(dt)
    elif cdx.get('memento_url'):
        # if 'memento_url' set and no Memento-Datetime header present
        # then its an error
        return None

    agg_type = upstream_res.headers.get('Warcserver-Type')
    if agg_type == 'warc':
        # upstream already serves a WARC record; pass it through as-is
        cdx['source'] = unquote(upstream_res.headers.get('Warcserver-Source-Coll'))
        return None, upstream_res.headers, upstream_res

    if upstream_res.version == 11:
        version = '1.1'
    else:
        version = '1.0'

    status = 'HTTP/{version} {status} {reason}\r\n'
    status = status.format(version=version,
                           status=upstream_res.status,
                           reason=upstream_res.reason)

    http_headers_buff = status

    orig_resp = upstream_res._original_response

    try:  #pragma: no cover
        # PY 3
        resp_headers = orig_resp.headers._headers
        for n, v in resp_headers:
            nl = n.lower()
            if nl in self.SKIP_HEADERS:
                continue

            if nl in self.UNREWRITE_HEADERS:
                v = self.unrewrite_header(cdx, v)

            http_headers_buff += n + ': ' + v + '\r\n'

        http_headers_buff += '\r\n'

        try:
            # http headers could be encoded as utf-8 (though non-standard)
            # first try utf-8 encoding
            http_headers_buff = http_headers_buff.encode('utf-8')
        except Exception:
            # then, fall back to latin-1
            http_headers_buff = http_headers_buff.encode('latin-1')

    except Exception:  #pragma: no cover
        # PY 2
        resp_headers = orig_resp.msg.headers

        for line in resp_headers:
            n, v = line.split(':', 1)
            n = n.lower()
            v = v.strip()

            if n in self.SKIP_HEADERS:
                continue

            new_v = v
            if n in self.UNREWRITE_HEADERS:
                new_v = self.unrewrite_header(cdx, v)

            if new_v != v:
                http_headers_buff += n + ': ' + new_v + '\r\n'
            else:
                http_headers_buff += line

        # if python2, already byte headers, so leave as is
        http_headers_buff += '\r\n'

    # best effort: peer address from the underlying socket, if reachable
    try:
        fp = upstream_res._fp.fp
        if hasattr(fp, 'raw'):  #pragma: no cover
            fp = fp.raw
        remote_ip = fp._sock.getpeername()[0]
    except Exception:  #pragma: no cover
        remote_ip = None

    warc_headers = {}

    warc_headers['WARC-Type'] = 'response'
    warc_headers['WARC-Record-ID'] = self._make_warc_id()
    warc_headers['WARC-Target-URI'] = cdx['url']
    warc_headers['WARC-Date'] = datetime_to_iso_date(dt)

    if not cdx.get('is_live'):
        now = datetime.datetime.utcnow()
        warc_headers['WARC-Source-URI'] = cdx.get('load_url')
        warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now)

    if remote_ip:
        warc_headers['WARC-IP-Address'] = remote_ip

    ct = upstream_res.headers.get('Content-Type')
    if ct:
        metadata = self.get_custom_metadata(ct, dt)
        if metadata:
            warc_headers['WARC-JSON-Metadata'] = json.dumps(metadata)

    warc_headers['Content-Type'] = 'application/http; msgtype=response'

    if method == 'HEAD':
        content_len = 0
    else:
        content_len = upstream_res.headers.get('Content-Length', -1)

    self._set_content_len(content_len,
                          warc_headers,
                          len(http_headers_buff))

    warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
    return (warc_headers, http_headers_buff, upstream_res)
def _writeResponse(self, item, concurrentTo):
    """
    Write the WARC response record for ``item``.

    The body is not stored (and the record is flagged with WARC-Truncated)
    when the item is a redirect, when its encoded length exceeds
    ``self.maxBodySize``, or when reading the body raises ``ValueError``.
    WARC-IP-Address and X-Chrome-Protocol are written only when the response
    reports a value for them.

    For items whose ``resourceType`` is ``'Document'`` the record id is
    remembered in ``self.documentRecords`` under the item's URL.

    :param item: request/response pair (dict-style response)
    :param concurrentTo: value stored in the WARC-Concurrent-To header
    """
    # fetch the body
    reqId = item.id
    rawBody = None
    base64Encoded = False
    bodyTruncated = None
    if item.isRedirect:
        # redirects reuse the same request, thus we cannot safely retrieve
        # the body (i.e getResponseBody may return the new location's
        # body).
        bodyTruncated = 'unspecified'
    elif item.encodedDataLength > self.maxBodySize:
        # check body size first, since we're loading everything into memory
        bodyTruncated = 'length'
        self.logger.error('body for {} too large {} vs {}'.format(
            reqId, item.encodedDataLength, self.maxBodySize))
    else:
        try:
            rawBody, base64Encoded = item.body
        except ValueError:
            # body could not be retrieved; cause unknown
            bodyTruncated = 'unspecified'

    # now the response
    resp = item.response
    warcHeaders = {
        'WARC-Concurrent-To': concurrentTo,
        'X-Chrome-FromDiskCache': str(resp.get('fromDiskCache')),
        'X-Chrome-ConnectionReused': str(resp.get('connectionReused')),
        'X-Chrome-Request-ID': item.id,
        # response wall time = request wall time + elapsed time between
        # the request and response timestamps
        'WARC-Date': datetime_to_iso_date(
            datetime.utcfromtimestamp(
                item.chromeRequest['wallTime'] +
                (item.chromeResponse['timestamp'] -
                 item.chromeRequest['timestamp']))),
    }

    # conditional WARC headers: an empty value is not a valid IP address,
    # so leave the header out when the response does not report one
    remoteIp = resp.get('remoteIPAddress')
    if remoteIp:
        warcHeaders['WARC-IP-Address'] = remoteIp
    protocol = resp.get('protocol')
    if protocol:
        warcHeaders['X-Chrome-Protocol'] = protocol

    if bodyTruncated:
        warcHeaders['WARC-Truncated'] = bodyTruncated
    else:
        warcHeaders['X-Chrome-Base64Body'] = str(base64Encoded)

    httpHeaders = StatusAndHeaders(
        '{} {}'.format(resp['status'], item.statusText),
        item.responseHeaders, protocol='HTTP/1.1')

    # Content is saved decompressed and decoded, remove these headers
    for name in ('transfer-encoding', 'content-encoding'):
        httpHeaders.remove_header(name)

    # chrome sends nothing but utf8 encoded text. Fortunately HTTP
    # headers take precedence over the document's <meta>, thus we can
    # easily override those.
    contentType = resp.get('mimeType')
    if contentType:
        if not base64Encoded:
            contentType += '; charset=utf-8'
        httpHeaders.replace_header('content-type', contentType)

    if rawBody is not None:
        httpHeaders.replace_header('content-length',
                                   '{:d}'.format(len(rawBody)))
        bodyIo = BytesIO(rawBody)
    else:
        bodyIo = BytesIO()

    record = self.writeRecord(resp['url'], 'response',
                              warc_headers_dict=warcHeaders, payload=bodyIo,
                              http_headers=httpHeaders)

    if item.resourceType == 'Document':
        self.documentRecords[item.url] = record.rec_headers.get_header(
            'WARC-Record-ID')