def _build_response_principal_record(self, recorded_url, warc_date):
    """Builds response or revisit record, whichever is appropriate.

    If dedup lookup attached a truthy `dedup_info` to recorded_url, a
    revisit record containing only the http header block is built;
    otherwise a full response record is built from the response recorder.
    """
    if hasattr(recorded_url, "dedup_info") and recorded_url.dedup_info:
        # revisit record: store only the http headers; the payload is
        # referenced by the WARC-Refers-To-* headers of the record
        recorded_url.response_recorder.tempfile.seek(0)
        if recorded_url.response_recorder.payload_offset is not None:
            response_header_block = recorded_url.response_recorder.tempfile.read(recorded_url.response_recorder.payload_offset)
        else:
            # no payload offset recorded, take the whole capture
            response_header_block = recorded_url.response_recorder.tempfile.read()
        return self.build_warc_record(
            url=recorded_url.url, warc_date=warc_date,
            data=response_header_block,
            warc_type=warctools.WarcRecord.REVISIT,
            refers_to=recorded_url.dedup_info.get('id'),
            refers_to_target_uri=recorded_url.dedup_info['url'],
            refers_to_date=recorded_url.dedup_info['date'],
            payload_digest=warcprox.digest_str(
                recorded_url.payload_digest, self.base32),
            profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
            content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
            remote_ip=recorded_url.remote_ip)
    else:
        # response record: full http response streamed from the recorder
        return self.build_warc_record(
            url=recorded_url.url, warc_date=warc_date,
            recorder=recorded_url.response_recorder,
            warc_type=warctools.WarcRecord.RESPONSE,
            content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
            remote_ip=recorded_url.remote_ip,
            payload_digest=warcprox.digest_str(
                recorded_url.payload_digest, self.base32),
            truncated=recorded_url.truncated)
def _build_response_principal_record(self, recorded_url, warc_date):
    """Builds response or revisit record, whichever is appropriate."""
    dedup_info = getattr(recorded_url, "dedup_info", None)
    if not dedup_info:
        # no dedup hit: write a full response record from the recorder
        return self.build_warc_record(
            url=recorded_url.url, warc_date=warc_date,
            recorder=recorded_url.response_recorder,
            warc_type=warctools.WarcRecord.RESPONSE,
            content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
            remote_ip=recorded_url.remote_ip,
            payload_digest=warcprox.digest_str(
                recorded_url.payload_digest, self.base32))
    # dedup hit: write a revisit record holding only the http headers
    recorder = recorded_url.response_recorder
    recorder.tempfile.seek(0)
    if recorder.payload_offset is not None:
        header_block = recorder.tempfile.read(recorder.payload_offset)
    else:
        header_block = recorder.tempfile.read()
    return self.build_warc_record(
        url=recorded_url.url, warc_date=warc_date,
        data=header_block,
        warc_type=warctools.WarcRecord.REVISIT,
        refers_to=dedup_info.get('id'),
        refers_to_target_uri=dedup_info['url'],
        refers_to_date=dedup_info['date'],
        payload_digest=warcprox.digest_str(
            recorded_url.payload_digest, self.base32),
        profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
        content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
        remote_ip=recorded_url.remote_ip)
def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
                      concurrent_to=None, warc_type=None, content_type=None,
                      remote_ip=None, profile=None, refers_to=None,
                      refers_to_target_uri=None, refers_to_date=None,
                      payload_digest=None):
    """Assembles a warctools.WarcRecord from the given WARC headers and
    either a response recorder (streamed content) or an in-memory `data`
    payload.

    Exactly one of `recorder` / `data` should be supplied. Returns the
    constructed warctools.WarcRecord.
    """
    if warc_date is None:
        warc_date = warctools.warc.warc_datetime_str(
            datetime.datetime.utcnow())
    record_id = warctools.WarcRecord.random_warc_uuid()
    headers = []
    if warc_type is not None:
        headers.append((warctools.WarcRecord.TYPE, warc_type))
    headers.append((warctools.WarcRecord.ID, record_id))
    headers.append((warctools.WarcRecord.DATE, warc_date))
    headers.append((warctools.WarcRecord.URL, url))
    if remote_ip is not None:
        headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
    if profile is not None:
        headers.append((warctools.WarcRecord.PROFILE, profile))
    if refers_to is not None:
        headers.append((warctools.WarcRecord.REFERS_TO, refers_to))
    if refers_to_target_uri is not None:
        headers.append((
            warctools.WarcRecord.REFERS_TO_TARGET_URI,
            refers_to_target_uri))
    if refers_to_date is not None:
        headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date))
    if concurrent_to is not None:
        headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
    if content_type is not None:
        headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
    if payload_digest is not None:
        headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
    if recorder is not None:
        headers.append((
            warctools.WarcRecord.CONTENT_LENGTH,
            str(len(recorder)).encode('latin1')))
        headers.append((
            warctools.WarcRecord.BLOCK_DIGEST,
            warcprox.digest_str(recorder.block_digest, self.base32)))
        # bugfix: only take the payload digest from the recorder when the
        # caller did not already supply one; previously both were appended,
        # producing a duplicate WARC-Payload-Digest header
        if payload_digest is None and recorder.payload_digest is not None:
            headers.append((
                warctools.WarcRecord.PAYLOAD_DIGEST,
                warcprox.digest_str(recorder.payload_digest, self.base32)))
        recorder.tempfile.seek(0)
        record = warctools.WarcRecord(
            headers=headers, content_file=recorder.tempfile)
    else:
        headers.append((
            warctools.WarcRecord.CONTENT_LENGTH,
            str(len(data)).encode('latin1')))
        digest = hashlib.new(self.digest_algorithm, data)
        headers.append((
            warctools.WarcRecord.BLOCK_DIGEST,
            warcprox.digest_str(digest, self.base32)))
        if not payload_digest:
            # no http headers here, so payload digest == block digest
            headers.append((
                warctools.WarcRecord.PAYLOAD_DIGEST,
                warcprox.digest_str(digest, self.base32)))
        content_tuple = content_type, data
        record = warctools.WarcRecord(headers=headers, content=content_tuple)
    return record
def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
                      concurrent_to=None, warc_type=None, content_type=None,
                      remote_ip=None, profile=None, refers_to=None,
                      refers_to_target_uri=None, refers_to_date=None,
                      payload_digest=None):
    """Assembles a warctools.WarcRecord from the given WARC headers and
    either a response recorder (streamed content) or an in-memory `data`
    payload.

    Exactly one of `recorder` / `data` should be supplied. Returns the
    constructed warctools.WarcRecord.
    """
    if warc_date is None:
        warc_date = warctools.warc.warc_datetime_str(
            datetime.datetime.utcnow())
    record_id = warctools.WarcRecord.random_warc_uuid()
    # header order matters for readability of the emitted record; append
    # in the conventional WARC header order
    headers = []
    if warc_type is not None:
        headers.append((warctools.WarcRecord.TYPE, warc_type))
    headers.append((warctools.WarcRecord.ID, record_id))
    headers.append((warctools.WarcRecord.DATE, warc_date))
    headers.append((warctools.WarcRecord.URL, url))
    if remote_ip is not None:
        headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
    if profile is not None:
        headers.append((warctools.WarcRecord.PROFILE, profile))
    if refers_to is not None:
        headers.append((warctools.WarcRecord.REFERS_TO, refers_to))
    if refers_to_target_uri is not None:
        headers.append((
            warctools.WarcRecord.REFERS_TO_TARGET_URI,
            refers_to_target_uri))
    if refers_to_date is not None:
        headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date))
    if concurrent_to is not None:
        headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
    if content_type is not None:
        headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
    if payload_digest is not None:
        headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
    if recorder is not None:
        # content comes from the recorder's tempfile; length and block
        # digest were accumulated while recording
        headers.append((
            warctools.WarcRecord.CONTENT_LENGTH,
            str(len(recorder)).encode('latin1')))
        headers.append((
            warctools.WarcRecord.BLOCK_DIGEST,
            warcprox.digest_str(recorder.block_digest, self.base32)))
        recorder.tempfile.seek(0)
        record = warctools.WarcRecord(
            headers=headers, content_file=recorder.tempfile)
    else:
        # in-memory payload: compute the block digest here
        headers.append((
            warctools.WarcRecord.CONTENT_LENGTH,
            str(len(data)).encode('latin1')))
        digest = hashlib.new(self.digest_algorithm, data)
        headers.append((
            warctools.WarcRecord.BLOCK_DIGEST,
            warcprox.digest_str(digest, self.base32)))
        if not payload_digest:
            # no http headers here, so payload digest == block digest
            headers.append((
                warctools.WarcRecord.PAYLOAD_DIGEST,
                warcprox.digest_str(digest, self.base32)))
        content_tuple = content_type, data
        record = warctools.WarcRecord(headers=headers, content=content_tuple)
    return record
def _filter_and_bucketize(self, batch):
    '''
    Returns `{bucket: [recorded_url, ...]}`, excluding urls that should
    not be looked up.
    '''
    buckets = collections.defaultdict(list)
    discards = []
    for recorded_url in batch:
        eligible = (
            recorded_url.response_recorder
            and recorded_url.payload_digest
            and self.trough_dedup_db.should_dedup(recorded_url))
        if not eligible:
            # keep a digest string (or 'n/a') for the debug log below
            if recorded_url.payload_digest:
                discards.append(warcprox.digest_str(
                    recorded_url.payload_digest, self.options.base32))
            else:
                discards.append('n/a')
            continue
        meta = recorded_url.warcprox_meta
        if meta and 'dedup-bucket' in meta:
            bucket = meta['dedup-bucket']
        else:
            bucket = '__unspecified__'
        buckets[bucket].append(recorded_url)
    self.logger.debug(
        'len(batch)=%s len(discards)=%s buckets=%s',
        len(batch), len(discards),
        {bucket: len(buckets[bucket]) for bucket in buckets})
    return buckets
def _filter_and_bucketize(self, batch):
    '''
    Returns `{bucket: [recorded_url, ...]}`, excluding urls that should
    not be looked up.
    '''
    buckets = collections.defaultdict(list)
    # discarded urls are only tracked for the debug log line below
    discards = []
    for recorded_url in batch:
        # a url is eligible for dedup lookup only when it has recorded
        # response content, a payload digest, and passes should_dedup()
        if (recorded_url.response_recorder
                and recorded_url.payload_digest
                and self.trough_dedup_db.should_dedup(recorded_url)):
            if (recorded_url.warcprox_meta
                    and 'dedup-bucket' in recorded_url.warcprox_meta):
                bucket = recorded_url.warcprox_meta['dedup-bucket']
            else:
                bucket = '__unspecified__'
            buckets[bucket].append(recorded_url)
        else:
            discards.append(
                warcprox.digest_str(
                    recorded_url.payload_digest, self.options.base32)
                if recorded_url.payload_digest else 'n/a')
    self.logger.debug(
        'len(batch)=%s len(discards)=%s buckets=%s',
        len(batch), len(discards),
        {bucket: len(buckets[bucket]) for bucket in buckets})
    return buckets
def build_warc_records(self, recorded_url):
    """Returns a tuple of hanzo.warctools.warc.WarcRecord
    (principal_record, ...)"""
    warc_date = self.format_warc_date(recorded_url.timestamp)
    if not recorded_url.response_recorder:
        # no recorded response: a custom (WARCPROX_WRITE_RECORD) record
        record = self.build_warc_record(
            url=recorded_url.url, warc_date=warc_date,
            data=recorded_url.request_data,
            warc_type=recorded_url.custom_type,
            content_type=recorded_url.content_type.encode("latin1"),
            payload_digest=warcprox.digest_str(
                recorded_url.payload_digest, self.base32),
            content_length=recorded_url.size)
        return (record,)
    # normal capture: response (or revisit) record plus a request record
    principal_record = self._build_response_principal_record(
        recorded_url, warc_date)
    request_record = self.build_warc_record(
        url=recorded_url.url, warc_date=warc_date,
        data=recorded_url.request_data,
        warc_type=warctools.WarcRecord.REQUEST,
        content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE,
        concurrent_to=principal_record.id)
    return principal_record, request_record
def notify(self, recorded_url, records):
    """Record the first written record in the dedup db, keyed by payload
    digest, if it is a response record that qualifies for dedup."""
    if not (records and records[0].type == b'response'
            and self.should_dedup(recorded_url)):
        return
    digest_key = warcprox.digest_str(
        recorded_url.payload_digest, self.options.base32)
    meta = recorded_url.warcprox_meta
    if meta and "dedup-bucket" in meta:
        self.save(digest_key, records[0], bucket=meta["dedup-bucket"])
    else:
        self.save(digest_key, records[0])
def notify(self, recorded_url, records):
    """Record the first written record in the dedup db, keyed by payload
    digest, for response records with a non-empty payload.

    Bugfix: `recorded_url.response_recorder` can be None (e.g. for
    records with no recorded http response), in which case calling
    `.payload_size()` raised AttributeError; guard it explicitly.
    """
    if (records and records[0].type == b'response'
            and recorded_url.response_recorder
            and recorded_url.response_recorder.payload_size() > 0):
        digest_key = warcprox.digest_str(
            recorded_url.payload_digest, self.options.base32)
        if (recorded_url.warcprox_meta
                and "captures-bucket" in recorded_url.warcprox_meta):
            self.save(
                digest_key, records[0],
                bucket=recorded_url.warcprox_meta["captures-bucket"])
        else:
            self.save(digest_key, records[0])
def notify(self, recorded_url, records):
    """Save a dedup db entry for the principal (first) record, keyed by
    payload digest, when it is a response record eligible for dedup."""
    if (records and records[0].type == b'response'
            and self.should_dedup(recorded_url)):
        digest_key = warcprox.digest_str(
            recorded_url.payload_digest, self.options.base32)
        # bucket-scoped entry if warcprox_meta names a dedup bucket,
        # otherwise the default bucket
        if recorded_url.warcprox_meta and "dedup-bucket" in recorded_url.warcprox_meta:
            self.save(
                digest_key, records[0],
                bucket=recorded_url.warcprox_meta["dedup-bucket"])
        else:
            self.save(digest_key, records[0])
def notify(self, recorded_url, records):
    """Save a dedup db entry for the principal (first) record, keyed by
    payload digest, for response records with a non-empty payload.

    Bugfix: guard `recorded_url.response_recorder` against None before
    calling `.payload_size()`; previously this raised AttributeError for
    records that have no recorded http response.
    """
    if (records and records[0].type == b'response'
            and recorded_url.response_recorder
            and recorded_url.response_recorder.payload_size() > 0):
        digest_key = warcprox.digest_str(
            recorded_url.payload_digest, self.options.base32)
        if (recorded_url.warcprox_meta
                and "captures-bucket" in recorded_url.warcprox_meta):
            self.save(
                digest_key, records[0],
                bucket=recorded_url.warcprox_meta["captures-bucket"])
        else:
            self.save(digest_key, records[0])
def notify(self, recorded_url, records):
    """Save dedup db entries for the principal record, one per writable
    dedup bucket named in warcprox_meta (or the default bucket)."""
    if not (records and records[0].type == b'response'
            and self.should_dedup(recorded_url)):
        return
    digest_key = warcprox.digest_str(
        recorded_url.payload_digest, self.options.base32)
    meta = recorded_url.warcprox_meta
    if meta and "dedup-buckets" in meta:
        for bucket, bucket_mode in meta["dedup-buckets"].items():
            # 'ro' buckets are consulted at lookup time but never written
            if not bucket_mode == 'ro':
                self.save(digest_key, records[0], bucket=bucket)
    else:
        self.save(digest_key, records[0])
def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
    """Look up recorded_url's payload digest in dedup_db and attach any
    hit to recorded_url as `dedup_info`."""
    has_payload = (
        recorded_url.response_recorder
        and recorded_url.payload_digest
        and recorded_url.response_recorder.payload_size() > 0)
    if not has_payload:
        return
    digest_key = warcprox.digest_str(recorded_url.payload_digest, base32)
    meta = recorded_url.warcprox_meta
    if meta and "captures-bucket" in meta:
        recorded_url.dedup_info = dedup_db.lookup(
            digest_key, meta["captures-bucket"], recorded_url.url)
    else:
        recorded_url.dedup_info = dedup_db.lookup(
            digest_key, url=recorded_url.url)
def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
    """Look up recorded_url's payload digest in dedup_db and, on a hit,
    attach the result to recorded_url as `dedup_info`.

    Only urls with a recorded response, a payload digest, and a non-empty
    payload are looked up.
    """
    if (recorded_url.response_recorder and recorded_url.payload_digest
            and recorded_url.response_recorder.payload_size() > 0):
        digest_key = warcprox.digest_str(recorded_url.payload_digest, base32)
        # bucket-scoped lookup when warcprox_meta names a captures bucket
        if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
            recorded_url.dedup_info = dedup_db.lookup(
                digest_key, recorded_url.warcprox_meta["captures-bucket"],
                recorded_url.url)
        else:
            recorded_url.dedup_info = dedup_db.lookup(
                digest_key, url=recorded_url.url)
def _process_url(self, recorded_url):
    """Attach `dedup_info` to recorded_url if an earlier capture with the
    same payload digest exists in the dedup db."""
    eligible = (
        recorded_url.response_recorder
        and recorded_url.payload_digest
        and self.should_dedup(recorded_url))
    if not eligible:
        return
    digest_key = warcprox.digest_str(
        recorded_url.payload_digest, self.options.base32)
    meta = recorded_url.warcprox_meta
    if meta and "dedup-bucket" in meta:
        recorded_url.dedup_info = self.dedup_db.lookup(
            digest_key, meta["dedup-bucket"], recorded_url.url)
    else:
        recorded_url.dedup_info = self.dedup_db.lookup(
            digest_key, url=recorded_url.url)
def _process_url(self, recorded_url):
    """Look up recorded_url's payload digest in the dedup db and attach
    any hit as `recorded_url.dedup_info`."""
    # only urls with recorded response content, a payload digest, and a
    # positive should_dedup() verdict are looked up
    if (recorded_url.response_recorder
            and recorded_url.payload_digest
            and self.should_dedup(recorded_url)):
        digest_key = warcprox.digest_str(
            recorded_url.payload_digest, self.options.base32)
        if recorded_url.warcprox_meta and "dedup-bucket" in recorded_url.warcprox_meta:
            recorded_url.dedup_info = self.dedup_db.lookup(
                digest_key, recorded_url.warcprox_meta["dedup-bucket"],
                recorded_url.url)
        else:
            recorded_url.dedup_info = self.dedup_db.lookup(
                digest_key, url=recorded_url.url)
def notify(self, recorded_url, records):
    """Append a heritrix-style line for this capture to the crawl log.

    Example line:
    2017-08-03T21:45:24.496Z   200       2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
    """
    now = datetime.datetime.utcnow()
    extra_info = {
        'contentSize': recorded_url.size,
    }
    if records:
        extra_info['warcFilename'] = records[0].warc_filename
        extra_info['warcFileOffset'] = records[0].offset
    if recorded_url.response_recorder:
        # content-length excluding the http header block
        content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset
        payload_digest = warcprox.digest_str(
            recorded_url.payload_digest, self.options.base32)
    else:
        # WARCPROX_WRITE_RECORD request
        # NOTE(review): assumes `records` is non-empty in this branch —
        # records[0] would raise IndexError otherwise; confirm with callers
        content_length = len(recorded_url.request_data)
        payload_digest = records[0].get_header(b'WARC-Payload-Digest')
    fields = [
        '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond // 1000),
        '% 5s' % recorded_url.status,
        '% 10s' % content_length,
        recorded_url.url,
        '-',  # hop path
        recorded_url.referer or '-',
        recorded_url.mimetype or '-',
        '-',
        '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
            recorded_url.timestamp,
            recorded_url.timestamp.microsecond // 1000,
            recorded_url.duration.microseconds // 1000),
        payload_digest,
        recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
        'duplicate:digest' if records and records[0].type == b'revisit' else '-',
        json.dumps(extra_info, separators=(',', ':')),
    ]
    for i in range(len(fields)):
        # `fields` is a mix of `bytes` and `unicode`, make them all `bytes`
        # NOTE(review): bare except deliberately leaves bytes entries as-is,
        # but it would also hide unrelated errors
        try:
            fields[i] = fields[i].encode('utf-8')
        except:
            pass
    line = b' '.join(fields) + b'\n'
    # per-prefix log file when a warc-prefix is configured for this capture
    if 'warc-prefix' in recorded_url.warcprox_meta:
        filename = '%s.log' % recorded_url.warcprox_meta['warc-prefix']
    else:
        filename = 'crawl.log'
    crawl_log_path = os.path.join(self.dir, filename)
    with open(crawl_log_path, 'ab') as f:
        f.write(line)
def notify(self, recorded_url, records):
    """Save a dedup db entry for the principal record, keyed by the
    recorder's payload digest, for response records with a non-empty
    payload.

    Bugfix: guard against empty `records` (records[0] raised IndexError)
    and against `recorded_url.response_recorder` being None (AttributeError
    for records with no recorded http response); sibling notify()
    implementations in this file already check `records and ...`.
    """
    if not records or not recorded_url.response_recorder:
        return
    if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
            and recorded_url.response_recorder.payload_size() > 0):
        digest_key = warcprox.digest_str(
            recorded_url.response_recorder.payload_digest,
            self.options.base32)
        if (recorded_url.warcprox_meta
                and "captures-bucket" in recorded_url.warcprox_meta):
            self.save(
                digest_key, records[0],
                bucket=recorded_url.warcprox_meta["captures-bucket"])
        else:
            self.save(digest_key, records[0])
def notify(self, recorded_url, records):
    """Append a heritrix-style line for this capture to the crawl log.

    Example line:
    2017-08-03T21:45:24.496Z   200       2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
    """
    now = datetime.datetime.utcnow()
    extra_info = {'contentSize': recorded_url.size,}
    if records:
        extra_info['warcFilename'] = records[0].warc_filename
        extra_info['warcFileOffset'] = records[0].offset
    if recorded_url.method != 'GET':
        extra_info['method'] = recorded_url.method
    if recorded_url.response_recorder:
        # content-length excluding the http header block
        content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset
        payload_digest = warcprox.digest_str(
            recorded_url.payload_digest, self.options.base32)
    else:
        # WARCPROX_WRITE_RECORD request
        # NOTE(review): assumes `records` is non-empty in this branch —
        # records[0] would raise IndexError otherwise; confirm with callers
        content_length = int(records[0].get_header(b'Content-Length'))
        payload_digest = records[0].get_header(b'WARC-Payload-Digest')
    fields = [
        '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
        '% 5s' % recorded_url.status,
        '% 10s' % content_length,
        recorded_url.url,
        '-',  # hop path
        recorded_url.referer or '-',
        recorded_url.mimetype or '-',
        '-',
        '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
            recorded_url.timestamp,
            recorded_url.timestamp.microsecond//1000,
            recorded_url.duration.microseconds//1000),
        payload_digest,
        recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
        'duplicate:digest' if records and records[0].type == b'revisit' else '-',
        json.dumps(extra_info, separators=(',',':')),
    ]
    for i in range(len(fields)):
        # `fields` is a mix of `bytes` and `unicode`, make them all `bytes`
        # NOTE(review): bare except deliberately leaves bytes entries as-is,
        # but it would also hide unrelated errors
        try:
            fields[i] = fields[i].encode('utf-8')
        except:
            pass
    line = b' '.join(fields) + b'\n'
    # log file is segmented by warc prefix, hostname and proxy port
    prefix = recorded_url.warcprox_meta.get('warc-prefix', 'crawl')
    filename = '%s-%s-%s.log' % (
        prefix, self.hostname, self.options.server_port)
    crawl_log_path = os.path.join(self.dir, filename)
    with open(crawl_log_path, 'ab') as f:
        f.write(line)
def _process_url(self, recorded_url):
    """Query the cdx server for a dedup hit on recorded_url's payload
    digest, attaching any hit as `dedup_info`; always removes the url
    from the in-flight batch and forwards it to the output queue."""
    try:
        key = warcprox.digest_str(
            recorded_url.payload_digest, self.options.base32)
        hit = self.cdx_dedup.lookup(key, recorded_url.url)
        if hit:
            recorded_url.dedup_info = hit
    except ValueError as exc:
        self.logger.error(
            'CdxServerDedupLoader _process_url failed for url=%s %s',
            recorded_url.url, exc)
    finally:
        # always hand the url downstream, dedup hit or not
        self.batch.remove(recorded_url)
        if self.outq:
            self.outq.put(recorded_url)
def batch_save(self, batch, bucket='__unspecified__'):
    """Write dedup entries for every url in `batch` to trough in a single
    parameterized multi-row insert."""
    placeholders = ','.join('(%s,%s,%s,%s)' for i in range(len(batch)))
    sql_tmpl = ('insert or ignore into dedup\n'
                '(digest_key, url, date, id)\n'
                'values %s;' % placeholders)
    values = []
    for recorded_url in batch:
        values.append(warcprox.digest_str(
            recorded_url.payload_digest, self.options.base32))
        values.append(recorded_url.url)
        values.append(recorded_url.warc_records[0].date)
        values.append(recorded_url.warc_records[0].id)
    self._trough_cli.write(bucket, sql_tmpl, values, self.SCHEMA_ID)
def batch_save(self, batch, bucket='__unspecified__'):
    """Write dedup entries for every url in `batch` to trough with a
    single multi-row parameterized insert."""
    # one (%s,%s,%s,%s) placeholder group per url; the %s placeholders
    # are filled in by the trough client, not by python string formatting
    sql_tmpl = ('insert or ignore into dedup\n'
                '(digest_key, url, date, id)\n'
                'values %s;' % ','.join(
                    '(%s,%s,%s,%s)' for i in range(len(batch))))
    values = []
    for recorded_url in batch:
        values.extend([
            warcprox.digest_str(
                recorded_url.payload_digest, self.options.base32),
            recorded_url.url,
            recorded_url.warc_records[0].date,
            recorded_url.warc_records[0].id,])
    self._trough_cli.write(bucket, sql_tmpl, values, self.SCHEMA_ID)
def to_json(recorded_url: warcprox.warcproxy.RecordedUrl,
            records: List[warctools.WarcRecord]):
    """Builds a capture-feed dict describing the capture and its principal
    warc record.

    Example output:
    {"status_code":200,"content_digest":"sha1:3VU56HI3BTMDZBL2TP7SQYXITT7VEAJQ","host":"www.kaosgl.com","via":"http://www.kaosgl.com/sayfa.php?id=4427","account_id":"877","seed":"http://www.kaosgl.com/","warc_filename":"ARCHIVEIT-6003-WEEKLY-JOB171310-20150903100014694-00002.warc.gz","url":"http://www.kaosgl.com/resim/HomofobiKarsitiBulusma/trabzon05.jpg","size":29700,"start_time_plus_duration":"20150903175709637+1049","timestamp":"2015-09-03T17:57:10.707Z","mimetype":"image/jpeg","collection_id":"6003","is_test_crawl":"false","job_name":"6003-20150902172136074","warc_offset":856320200,"thread":6,"hop_path":"RLLLLLE","extra_info":{},"annotations":"duplicate:digest","content_length":29432}
    """
    # Normal recorded response:
    if recorded_url.response_recorder:
        content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset
        payload_digest = warcprox.digest_str(
            recorded_url.payload_digest, True)
    else:
        # WARCPROX_WRITE_RECORD request:
        content_length = recorded_url.size
        payload_digest = records[0].get_header(b'WARC-Payload-Digest')
    # Deal with variation in content type:
    content_type = recorded_url.mimetype
    if content_type and content_type.find(" ") >= 0:
        content_type = "application/malformed-header"
    # Build the record:
    now = datetime.datetime.utcnow()
    d = {
        'url': recorded_url.url.decode('utf-8'),
        'host': recorded_url.host,
        'http_method': recorded_url.method,
        'status_code': recorded_url.status,
        'wire_bytes': recorded_url.size,
        'content_type': content_type,
        'content_digest': payload_digest.decode("utf-8"),
        'content_length': content_length,
        'start_time_plus_duration': '{:%Y%m%d%H%M%S}{:03d}+{}'.format(
            recorded_url.timestamp,
            recorded_url.timestamp.microsecond // 1000,
            int(recorded_url.duration.total_seconds() * 1000)),
        # bugfix: record type is bytes (compared as b'revisit' elsewhere in
        # this file); comparing against the str 'revisit' was always False,
        # so revisits were never annotated as duplicates
        'annotations':
            'duplicate:digest' if records[0].type == b'revisit' else '',
        'warc_filename': os.path.basename(records[0].warc_filename),
        'warc_offset': records[0].offset,
        'warc_length': records[0].length,
        "warc_content_type": records[0].content_type.decode("utf-8"),
        "warc_type": records[0].type.decode("utf-8"),
        "warc_id": records[0].id.decode("utf-8"),
        'timestamp': '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(
            now, now.microsecond // 1000),
    }
    # fields expected to be populated here are (for archive-it):
    # account_id, collection_id, is_test_crawl, seed, job_name
    if recorded_url.warcprox_meta and 'capture-feed-extra-fields' in recorded_url.warcprox_meta:
        for (k, v) in recorded_url.warcprox_meta['capture-feed-extra-fields'].items():
            d[k] = v
    return d
def _process_url(self, recorded_url):
    """Look up recorded_url's payload digest in the dedup db, checking
    each configured dedup bucket in turn, and attach any hit as
    `recorded_url.dedup_info`."""
    if (recorded_url.response_recorder
            and recorded_url.payload_digest
            and self.should_dedup(recorded_url)):
        digest_key = warcprox.digest_str(
            recorded_url.payload_digest, self.options.base32)
        if recorded_url.warcprox_meta and "dedup-buckets" in recorded_url.warcprox_meta:
            # NOTE(review): if "dedup-buckets" is an empty dict, no lookup
            # happens at all (not even the default-bucket lookup) — confirm
            # that is intended
            for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
                recorded_url.dedup_info = self.dedup_db.lookup(
                    digest_key, bucket, recorded_url.url)
                if recorded_url.dedup_info:
                    # we found an existing capture
                    break
        else:
            recorded_url.dedup_info = self.dedup_db.lookup(
                digest_key, url=recorded_url.url)
def _build_key_index(self, batch):
    '''
    Builds index of RecordedUrl by digest key.

    Args:
        batch(list): list of RecordedUrl

    Returns:
        dict `{digest_key: [recorded_url, ...]}`
    '''
    index = collections.defaultdict(list)
    for url in batch:
        key = warcprox.digest_str(url.payload_digest, self.options.base32)
        index[key].append(url)
    return index
def _build_key_index(self, batch):
    '''
    Builds index of RecordedUrl by digest key.

    Args:
        batch(list): list of RecordedUrl

    Returns:
        dict `{digest_key: [recorded_url, ...]}`
    '''
    key_index = collections.defaultdict(list)
    for recorded_url in batch:
        # urls sharing a payload digest end up in the same list
        digest_key = warcprox.digest_str(
            recorded_url.payload_digest, self.options.base32)
        key_index[digest_key].append(recorded_url)
    return key_index
def _process_url(self, recorded_url):
    """Query the cdx server (via a caching lookup) for a dedup hit on
    recorded_url's payload digest, attaching any hit as `dedup_info`;
    always removes the url from the in-flight batch and forwards it to
    the output queue."""
    try:
        key = warcprox.digest_str(
            recorded_url.payload_digest, self.options.base32)
        hit = self.cdx_dedup.cached_lookup(key, recorded_url.url)
        stats = self.cdx_dedup.cached_lookup.cache_info()
        # log cache effectiveness once every 1000 lookups
        if (stats.hits + stats.misses) % 1000 == 0:
            self.logger.info(self.cdx_dedup.cached_lookup.cache_info())
        if hit:
            recorded_url.dedup_info = hit
    except ValueError as exc:
        self.logger.error(
            'CdxServerDedupLoader _process_url failed for url=%s %s',
            recorded_url.url, exc)
    finally:
        # always hand the url downstream, dedup hit or not
        self.batch.remove(recorded_url)
        if self.outq:
            self.outq.put(recorded_url)
def build_warc_records(self, recorded_url):
    """Returns a tuple of hanzo.warctools.warc.WarcRecord
    (principal_record, ...)"""
    warc_date = self.format_warc_date(recorded_url.timestamp)
    if recorded_url.response_recorder:
        # normal capture: response/revisit record plus the request record
        principal = self._build_response_principal_record(
            recorded_url, warc_date)
        request = self.build_warc_record(
            url=recorded_url.url,
            warc_date=warc_date,
            data=recorded_url.request_data,
            warc_type=warctools.WarcRecord.REQUEST,
            content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE,
            concurrent_to=principal.id)
        return principal, request
    # no recorded response: a single custom (WARCPROX_WRITE_RECORD) record
    principal = self.build_warc_record(
        url=recorded_url.url,
        warc_date=warc_date,
        data=recorded_url.request_data,
        warc_type=recorded_url.custom_type,
        content_type=recorded_url.content_type.encode("latin1"),
        payload_digest=warcprox.digest_str(
            recorded_url.payload_digest, self.base32),
        content_length=recorded_url.size)
    return (principal,)
def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
                      concurrent_to=None, warc_type=None, content_type=None,
                      remote_ip=None, profile=None, refers_to=None,
                      refers_to_target_uri=None, refers_to_date=None,
                      payload_digest=None, truncated=None,
                      content_length=None):
    """Assembles a warctools.WarcRecord from the given WARC headers and
    either a response recorder (streamed content) or an in-memory /
    file-like `data` payload.

    Exactly one of `recorder` / `data` should be supplied. An explicit
    `content_length` overrides the length derived from the recorder/data.
    Returns the constructed warctools.WarcRecord.
    """
    if warc_date is None:
        warc_date = self.format_warc_date(datetime.datetime.utcnow())
    record_id = warctools.WarcRecord.random_warc_uuid()
    headers = []
    if warc_type is not None:
        headers.append((warctools.WarcRecord.TYPE, warc_type))
    headers.append((warctools.WarcRecord.ID, record_id))
    headers.append((warctools.WarcRecord.DATE, warc_date))
    headers.append((warctools.WarcRecord.URL, url))
    if remote_ip is not None:
        headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
    if profile is not None:
        headers.append((warctools.WarcRecord.PROFILE, profile))
    if refers_to is not None:
        headers.append((warctools.WarcRecord.REFERS_TO, refers_to))
    if refers_to_target_uri is not None:
        headers.append((
            warctools.WarcRecord.REFERS_TO_TARGET_URI,
            refers_to_target_uri))
    if refers_to_date is not None:
        headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date))
    if concurrent_to is not None:
        headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
    if content_type is not None:
        headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
    if payload_digest is not None:
        headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
    # truncated value may be 'length' or 'time'
    if truncated is not None:
        headers.append((b'WARC-Truncated', truncated))
    if recorder is not None:
        if content_length is not None:
            headers.append((
                warctools.WarcRecord.CONTENT_LENGTH,
                str(content_length).encode('latin1')))
        else:
            headers.append((
                warctools.WarcRecord.CONTENT_LENGTH,
                str(len(recorder)).encode('latin1')))
        headers.append((
            warctools.WarcRecord.BLOCK_DIGEST,
            warcprox.digest_str(recorder.block_digest, self.base32)))
        recorder.tempfile.seek(0)
        record = warctools.WarcRecord(
            headers=headers, content_file=recorder.tempfile)
    else:
        if content_length is not None:
            headers.append((
                warctools.WarcRecord.CONTENT_LENGTH,
                str(content_length).encode('latin1')))
        else:
            headers.append((
                warctools.WarcRecord.CONTENT_LENGTH,
                str(len(data)).encode('latin1')))
        # no http headers so block digest == payload digest
        if not payload_digest:
            payload_digest = warcprox.digest_str(
                hashlib.new(self.digest_algorithm, data), self.base32)
            headers.append((
                warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
        headers.append((warctools.WarcRecord.BLOCK_DIGEST, payload_digest))
        # `data` may be raw bytes or a file-like object
        if hasattr(data, 'read'):
            record = warctools.WarcRecord(
                headers=headers, content_file=data)
        else:
            content_tuple = content_type, data
            record = warctools.WarcRecord(
                headers=headers, content=content_tuple)
    return record
def put(self, recorded_url, block=True, timeout=None):
    """Log a one-line summary of the recorded url."""
    digest = warcprox.digest_str(
        recorded_url.payload_digest, False).decode('utf-8')
    logging.info("{} {} {} {} {} size={} {}".format(
        recorded_url.client_ip,
        recorded_url.status,
        recorded_url.method,
        recorded_url.url.decode("utf-8"),
        recorded_url.mimetype,
        recorded_url.size,
        digest))
def put(self, recorded_url, block=True, timeout=None):
    """Log a one-line summary of the recorded url.

    Bugfix: `recorded_url.response_recorder` is None for records with no
    recorded http response, so dereferencing `.payload_digest` on it
    unconditionally raised AttributeError; fall back to a '-' placeholder
    in that case.
    """
    if recorded_url.response_recorder:
        digest = warcprox.digest_str(
            recorded_url.response_recorder.payload_digest,
            False).decode('utf-8')
    else:
        digest = '-'
    logging.info("{} {} {} {} {} size={} {}".format(
        recorded_url.client_ip, recorded_url.status, recorded_url.method,
        recorded_url.url.decode("utf-8"), recorded_url.mimetype,
        recorded_url.size, digest))
def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
                      concurrent_to=None, warc_type=None, content_type=None,
                      remote_ip=None, profile=None, refers_to=None,
                      refers_to_target_uri=None, refers_to_date=None,
                      payload_digest=None, truncated=None,
                      content_length=None):
    """Assembles a warctools.WarcRecord from the given WARC headers and
    either a response recorder (streamed content) or an in-memory /
    file-like `data` payload.

    Exactly one of `recorder` / `data` should be supplied. An explicit
    `content_length` overrides the length derived from the recorder/data.
    Returns the constructed warctools.WarcRecord.
    """
    if warc_date is None:
        warc_date = self.format_warc_date(datetime.datetime.utcnow())
    record_id = warctools.WarcRecord.random_warc_uuid()
    headers = []
    if warc_type is not None:
        headers.append((warctools.WarcRecord.TYPE, warc_type))
    headers.append((warctools.WarcRecord.ID, record_id))
    headers.append((warctools.WarcRecord.DATE, warc_date))
    headers.append((warctools.WarcRecord.URL, url))
    if remote_ip is not None:
        headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
    if profile is not None:
        headers.append((warctools.WarcRecord.PROFILE, profile))
    if refers_to is not None:
        headers.append((warctools.WarcRecord.REFERS_TO, refers_to))
    if refers_to_target_uri is not None:
        headers.append((
            warctools.WarcRecord.REFERS_TO_TARGET_URI,
            refers_to_target_uri))
    if refers_to_date is not None:
        headers.append(
            (warctools.WarcRecord.REFERS_TO_DATE, refers_to_date))
    if concurrent_to is not None:
        headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
    if content_type is not None:
        headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
    if payload_digest is not None:
        headers.append(
            (warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
    # truncated value may be 'length' or 'time'
    if truncated is not None:
        headers.append((b'WARC-Truncated', truncated))
    if recorder is not None:
        if content_length is not None:
            headers.append((warctools.WarcRecord.CONTENT_LENGTH,
                str(content_length).encode('latin1')))
        else:
            headers.append((warctools.WarcRecord.CONTENT_LENGTH,
                str(len(recorder)).encode('latin1')))
        headers.append((
            warctools.WarcRecord.BLOCK_DIGEST,
            warcprox.digest_str(recorder.block_digest, self.base32)))
        recorder.tempfile.seek(0)
        record = warctools.WarcRecord(
            headers=headers, content_file=recorder.tempfile)
    else:
        if content_length is not None:
            headers.append((warctools.WarcRecord.CONTENT_LENGTH,
                str(content_length).encode('latin1')))
        else:
            headers.append((warctools.WarcRecord.CONTENT_LENGTH,
                str(len(data)).encode('latin1')))
        # no http headers so block digest == payload digest
        if not payload_digest:
            payload_digest = warcprox.digest_str(
                hashlib.new(self.digest_algorithm, data), self.base32)
            headers.append(
                (warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
        headers.append((warctools.WarcRecord.BLOCK_DIGEST, payload_digest))
        # `data` may be raw bytes or a file-like object
        if hasattr(data, 'read'):
            record = warctools.WarcRecord(headers=headers,
                content_file=data)
        else:
            content_tuple = content_type, data
            record = warctools.WarcRecord(headers=headers,
                content=content_tuple)
    return record