class CCWARCWriter:
    '''
    Writes DNS answers and HTTP request/response pairs into WARC files.

    Files are named `prefix[-subprefix]-serial-hostname.warc[.gz]`. A file is
    opened lazily on the first write and closed by `maybe_close()` once it
    is larger than `max_size` bytes; the next write then opens a new file
    with the next serial number.

    `create_default_info()` must be called before the first write, because
    `open()` emits a warcinfo record built from `self.info`.
    '''

    def __init__(self, prefix, max_size, subprefix=None, gzip=True, get_serial=None):
        '''
        prefix: first component of every filename, also used as isPartOf
        max_size: size in bytes after which the current file is closed
        subprefix: optional second filename component
        gzip: passed to WARCWriter; also appends '.gz' to the filename
        get_serial: optional callable(filename_stem) -> serial string,
            replacing the built-in counter
        '''
        self.writer = None
        self.f = None
        self.prefix = prefix
        self.subprefix = subprefix
        self.max_size = max_size
        self.gzip = gzip
        self.hostname = socket.gethostname()
        self.external_get_serial = get_serial
        self.serial = 0

    def __del__(self):
        # getattr: __del__ also runs on instances whose __init__ never finished
        if getattr(self, 'writer', None) is not None:
            fh = getattr(self, 'f', None)
            if fh is not None:
                fh.close()

    def create_default_info(self, version, warcheader_version, ip,
                            description=None, creator=None, operator=None):
        '''
        Build, remember (as self.info) and return the warcinfo fields.

        creator:  # person, organization, service
        operator: # person, if creator is an organization
        isPartOf: # name of the crawl
        '''
        info = OrderedDict()
        info['software'] = ('cocrawler/' + version +
                            ' cocrawler_warcheader_version/' + warcheader_version)
        info['hostname'] = self.hostname
        info['ip'] = ip
        if description:
            info['description'] = description
        if creator:
            info['creator'] = creator
        if operator:
            info['operator'] = operator
        info['isPartOf'] = self.prefix  # intentionally does not include subprefix
        info['format'] = 'WARC file version 1.0'
        self.info = info
        return info

    def _make_filename(self):
        '''Return the name of the next WARC file; consumes one serial number.'''
        stem = self.prefix
        if self.subprefix:
            stem += '-' + str(self.subprefix)  # don't let yaml leave this as an int
        serial = self.get_serial(stem)
        filename = stem + '-' + serial + '-' + self.hostname + '.warc'
        if self.gzip:
            filename += '.gz'
        return filename

    def open(self):
        '''
        Open the next WARC file and write its warcinfo record.

        Instance state is only updated once the warcinfo record was written;
        on failure the file handle is closed and the exception re-raised.
        '''
        filename = self._make_filename()
        fh = open(filename, 'wb')
        try:
            writer = WARCWriter(fh, gzip=self.gzip)
            record = writer.create_warcinfo_record(filename, self.info)
            writer.write_record(record)
        except Exception:
            fh.close()
            raise
        self.filename = filename
        self.f = fh
        self.writer = writer

    def get_serial(self, filename):
        '''Return the serial string for `filename` (external callable or 6-digit counter).'''
        if self.external_get_serial is not None:
            return self.external_get_serial(filename)
        current = self.serial
        self.serial += 1
        return '{:06}'.format(current)

    def maybe_close(self):
        '''
        Close the current file if it has grown past max_size.

        TODO: always close/reopen if subprefix is not None; to minimize open filehandles?
        '''
        if self.writer is None:
            return  # nothing is open
        fsize = os.fstat(self.f.fileno()).st_size
        if fsize > self.max_size:
            self.f.close()
            self.writer = None

    def write_dns(self, dns, ttl, url):
        '''
        Write a 'resource' record of type text/dns for url.hostname.

        dns: iterable of answers, each indexable by 'host'
        ttl: time-to-live, converted with int()
        url: object with a .hostname attribute
        '''
        # write it out even if empty
        # TODO: we filter the addresses early, should we warc the unfiltered dns response?

        # the response object doesn't contain the query type 'A' or 'AAAA'
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'  # fixme IPV6

        ttl = int(ttl)
        host = url.hostname

        if self.writer is None:
            self.open()

        lines = [timestamp_now()]
        for r in dns:
            try:
                lines.append('\t'.join((host + '.', str(ttl), 'IN', kind, r['host'])))
            except Exception as e:
                # best effort: skip the malformed answer, keep the rest
                LOGGER.info('problem converting dns reply for warcing: host=%s reply=%r error=%r',
                            host, r, e)
        payload = ''.join(line + '\r\n' for line in lines).encode('utf-8')

        record = self.writer.create_warc_record('dns:' + host, 'resource',
                                                payload=BytesIO(payload),
                                                warc_content_type='text/dns',
                                                length=len(payload))
        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s', p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)

    def _fake_resp_headers(self, resp_headers, body_len, decompressed=False):
        '''
        Rewrite response headers so that they describe the stored payload.

        resp_headers: iterable of (name, value) pairs of bytes
        body_len: length of the payload that will be stored
        decompressed: True if the payload was decompressed before storing

        Headers that no longer match the payload are kept under an
        'X-Crawler-' prefixed name. Returns a list of (name, value) tuples.
        '''
        prefix = b'X-Crawler-'
        ret = []
        for h, v in resp_headers:
            hl = h.lower()
            if hl == b'content-length':
                if not (v.isdigit() and int(v) == body_len):
                    ret.append((prefix + h, v))  # keep what the server claimed
                # always emit an accurate Content-Length for the stored body
                # NOTE(review): the value is str while the other values are bytes -- confirm
                ret.append((b'Content-Length', str(body_len)))
            elif hl == b'content-encoding' and decompressed:
                ret.append((prefix + h, v))
            elif hl == b'transfer-encoding' and v.lower() == b'chunked':
                # aiohttp always undoes chunking
                ret.append((prefix + h, v))
            else:
                ret.append((h, v))
        return ret

    def write_request_response_pair(self, url, ip, req_headers, resp_headers,
                                    is_truncated, payload, digest=None, decompressed=False):
        '''
        Write a request record and a response record for `url`, then rotate
        the file if it is over max_size.

        ip: value for WARC-IP-Address, or None
        is_truncated: falsy, or a WARC-Truncated reason; reasons not in
            valid_truncations are logged and written as 'unspecified'
        payload: response body as bytes
        digest: optional value for WARC-Payload-Digest
        decompressed: forwarded to _fake_resp_headers
        '''
        if self.writer is None:
            self.open()

        # NOTE(review): the request line and request target URI are fixed
        # placeholders, not derived from `url` -- confirm this is intended
        req_http_headers = StatusAndHeaders('GET / HTTP/1.1', req_headers)
        request = self.writer.create_warc_record('http://example.com/', 'request',
                                                 http_headers=req_http_headers)

        fake_resp_headers = self._fake_resp_headers(resp_headers, len(payload),
                                                    decompressed=decompressed)
        resp_http_headers = StatusAndHeaders('200 OK', fake_resp_headers, protocol='HTTP/1.1')

        warc_headers_dict = OrderedDict()
        if ip is not None:
            # ip should be here unless we crawl through a proxy
            warc_headers_dict['WARC-IP-Address'] = ip
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                # %s formatting: is_truncated may not be a str (e.g. True)
                LOGGER.error('Invalid is_truncation of %s', is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(url, 'response',
                                                  payload=BytesIO(payload),
                                                  length=len(payload),
                                                  warc_headers_dict=warc_headers_dict,
                                                  http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s', p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)
class HarParser(object):
    '''
    Converts a HAR (HTTP Archive) log into WARC records.

    The HAR input may be a filename, a file-like object or an already
    decoded dict; the output may be an existing warc writer, a filename or
    a writable file-like object.
    '''

    logger = logging.getLogger(__name__)

    def __init__(self, reader, writer, gzip=True):
        '''
        reader: HAR filename (str), object with .read(), or decoded dict
        writer: BaseWARCWriter instance, output filename (str), or object
            with .write()
        gzip: passed to WARCWriter when this class creates the writer

        Raises Exception if reader or writer is of an unsupported kind.
        '''
        if isinstance(reader, str):
            with codecs.open(reader, encoding='utf-8') as fh:
                self.har = json.loads(fh.read())
        elif hasattr(reader, 'read'):
            self.har = json.loads(reader.read())
        elif isinstance(reader, dict):
            self.har = reader
        else:
            raise Exception('reader is in an unknown format')

        # only set when this object opened the output file itself
        self.fh = None
        if isinstance(writer, BaseWARCWriter):
            self.writer = writer
        elif isinstance(writer, str):
            self.fh = open(writer, 'wb')
            self.writer = WARCWriter(self.fh, gzip=gzip)
        elif hasattr(writer, 'write'):
            self.writer = WARCWriter(writer, gzip=gzip)
        else:
            raise Exception('writer is in an unknown format')

    def parse(self, out_filename=None, rec_title=None):
        '''
        Write a warcinfo record followed by one request/response pair per
        HAR entry. A file opened by __init__ is closed afterwards, also
        when an entry fails to convert.
        '''
        out_filename = out_filename or 'har.warc.gz'
        rec_title = rec_title or 'HAR Recording'
        try:
            metadata = self.create_wr_metadata(self.har['log'], rec_title)
            self.write_warc_info(self.har['log'], out_filename, metadata)

            for entry in self.har['log']['entries']:
                self.parse_entry(entry)
        finally:
            if self.fh:
                self.fh.close()

    def parse_entry(self, entry):
        '''Convert one HAR entry and write it as a request/response pair.'''
        url = entry['request']['url']

        response = self.parse_response(url,
                                       entry['response'],
                                       entry.get('serverIPAddress'))

        #TODO: support WARC/1.1 arbitrary precision dates!
        # NOTE(review): this keeps the first 19 characters and labels them
        # 'Z'; a non-UTC offset in startedDateTime is dropped, not applied
        warc_date = entry['startedDateTime'][:19] + 'Z'

        response.rec_headers.replace_header('WARC-Date', warc_date)

        request = self.parse_request(entry['request'])

        self.writer.write_request_response_pair(request, response)

    def create_wr_metadata(self, log, rec_title):
        '''
        Return the dict stored as 'json-metadata' in the warcinfo record.

        Only pages whose title starts with 'http:' or 'https:' are listed;
        the title is used as the page url as well.
        '''
        pagelist = []

        # 'pages' may be absent or null; treat that as "no pages"
        for page in log.get('pages') or []:
            if not page['title'].startswith(('http:', 'https:')):
                continue

            pagelist.append(dict(title=page['title'],
                                 url=page['title'],
                                 timestamp=iso_date_to_timestamp(page['startedDateTime'])))

        metadata = {"title": rec_title,
                    "type": "recording",
                   }

        if pagelist:
            metadata["pages"] = pagelist

        return metadata

    def write_warc_info(self, log, filename, metadata):
        '''Write the warcinfo record describing the HAR creator and this tool.'''
        creator = '{0} {1}'.format(log['creator']['name'],
                                   log['creator']['version'])

        source = 'HAR Format {0}'.format(log['version'])

        software = 'har2warc ' + str(__version__)

        params = OrderedDict([('software', software),
                              ('creator', creator),
                              ('source', source),
                              ('format', 'WARC File Format 1.0'),
                              ('json-metadata', json.dumps(metadata))])

        record = self.writer.create_warcinfo_record(filename, params)
        self.writer.write_record(record)

    def _get_http_version(self, entry):
        '''
        Return 'HTTP/1.1' or 'HTTP/1.0' for a HAR request/response dict.

        Anything else (missing, 'h2', ...) is written as 'HTTP/1.1'.
        '''
        http_version = (entry.get('httpVersion') or '').upper()
        if http_version not in ('HTTP/1.1', 'HTTP/1.0'):
            http_version = 'HTTP/1.1'

        return http_version

    @staticmethod
    def _request_path(url, query):
        '''
        Return the request-target (path plus optional query) for `url`.

        query: the HAR 'queryString' list of {'name', 'value'} dicts, or
            None/empty, in which case the query of `url` itself is used.
        '''
        parts = urlsplit(url)
        path = parts.path or '/'  # a request line needs a non-empty target
        if query:
            # list of pairs, not a dict: repeated parameter names must survive
            path += '?' + urlencode([(q['name'], q['value']) for q in query])
        elif parts.query:
            path += '?' + parts.query
        return path

    def parse_response(self, url, response, ip=None):
        '''
        Build the WARC 'response' record for a HAR response dict.

        url: target URI of the record
        response: HAR response object
        ip: optional value for WARC-IP-Address
        '''
        headers = []
        payload = BytesIO()
        content_info = response.get('content') or {}
        content = content_info.get('text') or ''
        har_headers = response.get('headers') or []

        if not content and not har_headers:
            self.logger.info('No headers or payload for: %s', url)
            headers.append(('Content-Length', '0'))
        if content_info.get('encoding') == 'base64':
            payload.write(base64.b64decode(content))
        else:
            payload.write(content.encode('utf-8'))

        length = payload.tell()
        payload.seek(0)

        # the stored body is already decoded, so these no longer apply
        SKIP_HEADERS = ('content-encoding', 'transfer-encoding')

        #TODO: http2 detection -- write as same warc header?
        for header in har_headers:
            if header['name'].lower() not in SKIP_HEADERS:
                headers.append((header['name'], header['value']))

        status = response.get('status') or 204

        reason = response.get('statusText')
        if not reason:
            reason = http_status_names.get(status, 'No Reason')

        status_line = str(status) + ' ' + reason

        proto = self._get_http_version(response)

        http_headers = StatusAndHeaders(status_line, headers, protocol=proto)

        if not content:
            content_length = http_headers.get_header('Content-Length', '0')
            if content_length != '0':
                self.logger.info('No Content for length %s %s', content_length, url)
                http_headers.replace_header('Content-Length', '0')
        else:
            http_headers.replace_header('Content-Length', str(length))

        warc_headers_dict = {}
        if ip:
            warc_headers_dict['WARC-IP-Address'] = ip

        record = self.writer.create_warc_record(url, 'response',
                                                http_headers=http_headers,
                                                payload=payload,
                                                length=length,
                                                warc_headers_dict=warc_headers_dict)

        return record

    def parse_request(self, request):
        '''Build the WARC 'request' record for a HAR request dict.'''
        path = self._request_path(request['url'], request.get('queryString'))

        headers = []
        http2 = False

        for header in request.get('headers') or []:
            headers.append((header['name'], header['value']))

            #TODO: http2 detection -- write as same warc header?
            if header['name'] in (':method', ':scheme', ':path'):
                http2 = True

        if http2:
            # http2 requests carry the host in a pseudo-header; add a Host header
            headers.append(('Host', urlsplit(request['url']).netloc))

        http_version = self._get_http_version(request)

        status_line = request['method'] + ' ' + path + ' ' + http_version
        http_headers = StatusAndHeaders(status_line, headers)

        payload = None
        length = 0

        # bodySize may be missing or negative, and postData may be absent
        if (request.get('bodySize') or 0) > 0:
            body = (request.get('postData') or {}).get('text') or ''
            payload = BytesIO()
            payload.write(body.encode('utf-8'))
            length = payload.tell()
            payload.seek(0)

        record = self.writer.create_warc_record(request['url'], 'request',
                                                http_headers=http_headers,
                                                payload=payload,
                                                length=length)

        return record
class CCWARCWriter:
    '''
    Writes DNS answers and HTTP request/response pairs into WARC files.

    Files are named `prefix[-subprefix]-serial-hostname.warc[.gz]`. A file is
    opened lazily on the first write and closed by `maybe_close()` once it
    is larger than `max_size` bytes; the next write then opens a new file
    with the next serial number.

    `create_default_info()` must be called before the first write, because
    `open()` emits a warcinfo record built from `self.info`.
    '''

    def __init__(self, prefix, max_size, subprefix=None, gzip=True, get_serial=None):
        '''
        prefix: first component of every filename, also used as isPartOf
        max_size: size in bytes after which the current file is closed
        subprefix: optional second filename component
        gzip: passed to WARCWriter; also appends '.gz' to the filename
        get_serial: optional callable(filename_stem) -> serial string,
            replacing the built-in counter
        '''
        self.writer = None
        self.f = None
        self.prefix = prefix
        self.subprefix = subprefix
        self.max_size = max_size
        self.gzip = gzip
        self.hostname = socket.gethostname()
        self.external_get_serial = get_serial
        self.serial = 0

    def __del__(self):
        # getattr: __del__ also runs on instances whose __init__ never finished
        if getattr(self, 'writer', None) is not None:
            fh = getattr(self, 'f', None)
            if fh is not None:
                fh.close()

    def create_default_info(self, version, ip, description=None, creator=None, operator=None):
        '''
        Build, remember (as self.info) and return the warcinfo fields.

        creator:  # person, organization, service
        operator: # person, if creator is an organization
        isPartOf: # name of the crawl
        '''
        info = OrderedDict()
        info['software'] = 'cocrawler/' + version
        info['hostname'] = self.hostname
        info['ip'] = ip
        if description:
            info['description'] = description
        if creator:
            info['creator'] = creator
        if operator:
            info['operator'] = operator
        info['isPartOf'] = self.prefix  # intentionally does not include subprefix
        info['format'] = 'WARC file version 1.0'
        self.info = info
        return info

    def _make_filename(self):
        '''Return the name of the next WARC file; consumes one serial number.'''
        stem = self.prefix
        if self.subprefix:
            # str(): a subprefix read from config may arrive as an int
            stem += '-' + str(self.subprefix)
        serial = self.get_serial(stem)
        filename = stem + '-' + serial + '-' + self.hostname + '.warc'
        if self.gzip:
            filename += '.gz'
        return filename

    def open(self):
        '''
        Open the next WARC file and write its warcinfo record.

        Instance state is only updated once the warcinfo record was written;
        on failure the file handle is closed and the exception re-raised.
        '''
        filename = self._make_filename()
        fh = open(filename, 'wb')
        try:
            writer = WARCWriter(fh, gzip=self.gzip)
            record = writer.create_warcinfo_record(filename, self.info)
            writer.write_record(record)
        except Exception:
            fh.close()
            raise
        self.filename = filename
        self.f = fh
        self.writer = writer

    def get_serial(self, filename):
        '''Return the serial string for `filename` (external callable or 6-digit counter).'''
        if self.external_get_serial is not None:
            return self.external_get_serial(filename)
        current = self.serial
        self.serial += 1
        return '{:06}'.format(current)

    def maybe_close(self):
        '''
        Close the current file if it has grown past max_size.

        TODO: always close/reopen if subprefix is not None; minimizes open filehandles
        '''
        if self.writer is None:
            return  # nothing is open
        fsize = os.fstat(self.f.fileno()).st_size
        if fsize > self.max_size:
            self.f.close()
            self.writer = None

    def write_dns(self, dns, expires, url):
        '''
        Write a 'resource' record of type text/dns for url.hostname.

        dns: iterable of answers, each indexable by 'host'
        expires: absolute expiry time on the time.time() clock; the TTL
            written is the whole number of seconds remaining, never negative
        url: object with a .hostname attribute
        '''
        # write it out even if empty
        # TODO: we filter the addresses early, should we warc the unfiltered dns response?

        # the response object doesn't contain the query type 'A' or 'AAAA'
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'  # fixme IPV6

        # an already expired answer would otherwise yield a negative TTL
        ttl = max(0, int(expires - time.time()))
        host = url.hostname

        if self.writer is None:
            self.open()

        lines = [timestamp_now()]
        for r in dns:
            try:
                lines.append(host + '.\t' + str(ttl) + '\tIN\t' + kind + '\t' + r['host'])
            except Exception as e:
                # best effort: skip the malformed answer, keep the rest
                LOGGER.info('problem converting dns reply for warcing: host=%s reply=%r error=%r',
                            host, r, e)
        payload = ''.join(line + '\r\n' for line in lines).encode('utf-8')

        record = self.writer.create_warc_record('dns:' + host, 'resource',
                                                payload=BytesIO(payload),
                                                warc_content_type='text/dns',
                                                length=len(payload))
        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s', p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)

    def write_request_response_pair(self, url, req_headers, resp_headers,
                                    is_truncated, payload, digest=None):
        '''
        Write a request record and a response record for `url`, then rotate
        the file if it is over max_size.

        req_headers, resp_headers: converted with headers_to_str_headers()
        is_truncated: falsy, or a WARC-Truncated reason; reasons not in
            valid_truncations are logged and written as 'unspecified'
        payload: response body as bytes
        digest: optional value for WARC-Payload-Digest
        '''
        if self.writer is None:
            self.open()

        # XXX WARC-Identified-Payload-Type set from Apache Tika? (done by Common Crawl) (how expensive?)

        # NOTE(review): the request line and request target URI are fixed
        # placeholders, not derived from `url` -- confirm this is intended
        req_http_headers = StatusAndHeaders('GET / HTTP/1.1',
                                            headers_to_str_headers(req_headers))
        request = self.writer.create_warc_record('http://example.com/', 'request',
                                                 http_headers=req_http_headers)

        resp_http_headers = StatusAndHeaders('200 OK',
                                             headers_to_str_headers(resp_headers),
                                             protocol='HTTP/1.1')

        warc_headers_dict = {}
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                # %s formatting: is_truncated may not be a str (e.g. True)
                LOGGER.error('Invalid is_truncation of %s', is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(url, 'response',
                                                  payload=BytesIO(payload),
                                                  length=len(payload),
                                                  warc_headers_dict=warc_headers_dict,
                                                  http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s', p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)