def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
        concurrent_to=None, warc_type=None, content_type=None,
        remote_ip=None, profile=None, refers_to=None,
        refers_to_target_uri=None, refers_to_date=None,
        payload_digest=None):
    """
    Build a ``warctools.WarcRecord`` from either a recorder or raw data.

    The record always carries ID, DATE and URL headers; every other
    header is added only when the matching argument is not None.

    :param url: value for the record's URL header
    :param warc_date: pre-formatted record date; defaults to the current
        UTC time formatted with ``warctools.warc.warc_datetime_str``
    :param recorder: object supplying ``len()``, ``block_digest``,
        ``payload_digest`` and ``tempfile``; when given, the record
        content is read from ``recorder.tempfile``
    :param data: in-memory content, used only when ``recorder`` is None
    :param payload_digest: explicit payload digest header value; takes
        precedence over any digest derived from ``recorder`` or ``data``
    :return: the constructed ``warctools.WarcRecord``
    """
    if warc_date is None:
        warc_date = warctools.warc.warc_datetime_str(
                datetime.datetime.utcnow())

    record_id = warctools.WarcRecord.random_warc_uuid()

    headers = []
    if warc_type is not None:
        headers.append((warctools.WarcRecord.TYPE, warc_type))
    headers.append((warctools.WarcRecord.ID, record_id))
    headers.append((warctools.WarcRecord.DATE, warc_date))
    headers.append((warctools.WarcRecord.URL, url))
    if remote_ip is not None:
        headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
    if profile is not None:
        headers.append((warctools.WarcRecord.PROFILE, profile))
    if refers_to is not None:
        headers.append((warctools.WarcRecord.REFERS_TO, refers_to))
    if refers_to_target_uri is not None:
        headers.append((warctools.WarcRecord.REFERS_TO_TARGET_URI,
                        refers_to_target_uri))
    if refers_to_date is not None:
        headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date))
    if concurrent_to is not None:
        headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
    if content_type is not None:
        headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
    if payload_digest is not None:
        headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))

    if recorder is not None:
        headers.append((warctools.WarcRecord.CONTENT_LENGTH,
                        str(len(recorder)).encode('latin1')))
        headers.append((warctools.WarcRecord.BLOCK_DIGEST,
                        warcprox.digest_str(recorder.block_digest,
                                            self.base32)))
        # Fall back to the recorder's payload digest only when the caller
        # did not pass one; otherwise the header would be emitted twice.
        if payload_digest is None and recorder.payload_digest is not None:
            headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
                            warcprox.digest_str(recorder.payload_digest,
                                                self.base32)))
        # Rewind so the record content is read from the start of the file.
        recorder.tempfile.seek(0)
        record = warctools.WarcRecord(
                headers=headers, content_file=recorder.tempfile)
    else:
        headers.append((warctools.WarcRecord.CONTENT_LENGTH,
                        str(len(data)).encode('latin1')))
        digest = hashlib.new(self.digest_algorithm, data)
        headers.append((warctools.WarcRecord.BLOCK_DIGEST,
                        warcprox.digest_str(digest, self.base32)))
        # Without an explicit payload digest, reuse the digest of the
        # whole block as the payload digest.
        if not payload_digest:
            headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
                            warcprox.digest_str(digest, self.base32)))
        content_tuple = content_type, data
        record = warctools.WarcRecord(
                headers=headers, content=content_tuple)

    return record
def build_warc_record(self, url, warc_date=None, content_buffer=None,
        content_file=None, content_length=None, concurrent_to=None,
        warc_type=None, content_type=None, remote_ip=None, profile=None,
        refers_to=None, refers_to_target_uri=None, refers_to_date=None,
        record_id=None, block_digest=None, payload_digest=None):
    """
    Build a ``warctools.WarcRecord`` from a content buffer or content file.

    The record always carries ID, DATE and URL headers; every other
    header is added only when the matching argument is not None.

    :param url: value for the record's URL header
    :param warc_date: pre-formatted record date; defaults to the current
        UTC time formatted with ``warctools.warc.warc_datetime_str``
    :param content_buffer: in-memory content; required when
        ``content_file`` is None
    :param content_file: file-like content source; mutually exclusive
        with ``content_buffer`` and requires ``content_length``
    :param content_length: value for the Content-Length header
    :param record_id: record ID; a random WARC UUID is generated if None
    :param block_digest: value for the block digest header
    :param payload_digest: value for the payload digest header
    :return: the constructed ``warctools.WarcRecord``
    :raises AssertionError: if the content arguments are inconsistent
    """
    if warc_date is None:
        # Use UTC like the other record builders in this file.
        # NOTE(review): warc_datetime_str appears to append a 'Z' suffix,
        # so a local-time value would be mislabelled as UTC - confirm.
        warc_date = warctools.warc.warc_datetime_str(datetime.utcnow())

    if record_id is None:
        record_id = warctools.WarcRecord.random_warc_uuid()

    headers = []
    if warc_type is not None:
        headers.append((warctools.WarcRecord.TYPE, warc_type))
    headers.append((warctools.WarcRecord.ID, record_id))
    headers.append((warctools.WarcRecord.DATE, warc_date))
    headers.append((warctools.WarcRecord.URL, url))
    if remote_ip is not None:
        headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
    if profile is not None:
        headers.append((warctools.WarcRecord.PROFILE, profile))
    if refers_to is not None:
        headers.append((warctools.WarcRecord.REFERS_TO, refers_to))
    if refers_to_target_uri is not None:
        headers.append((warctools.WarcRecord.REFERS_TO_TARGET_URI,
                        refers_to_target_uri))
    if refers_to_date is not None:
        headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date))
    if concurrent_to is not None:
        headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
    if content_type is not None:
        headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
    if content_length is not None:
        headers.append((warctools.WarcRecord.CONTENT_LENGTH, content_length))
    if block_digest is not None:
        headers.append((warctools.WarcRecord.BLOCK_DIGEST, block_digest))
    if payload_digest is not None:
        # The payload digest belongs under its own header name; it was
        # previously written as a second block digest header.
        headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))

    if content_file is not None:
        assert content_buffer is None
        assert content_length is not None
        record = warctools.WarcRecord(
                headers=headers, content_file=content_file)
    else:
        assert content_buffer is not None
        content_tuple = (content_type, content_buffer)
        record = warctools.WarcRecord(
                headers=headers, content=content_tuple)

    return record
def write_warc_record(self, record_type, url, data, content_type, warc_date=None, out_file=None, extra_headers=None):
    """
    Build one WARC record and write it, gzipped, to a WARC file.

    :param record_type: value for the record's TYPE header
    :param url: value for the record's URL header
    :param data: record content as bytes; also hashed with SHA-1 for the
        block digest header
    :param content_type: content type paired with ``data`` in the record
    :param warc_date: date string, or an object with ``isoformat`` that is
        converted via ``warctools.warc.warc_datetime_str``; defaults to
        ``timezone.now()``
    :param out_file: open file to write to; when falsy, a file is obtained
        from ``self.open_warc_for_writing()`` and handed to
        ``self.close_warc_after_writing()`` once the record is written
    :param extra_headers: optional iterable of extra header tuples
    :return: the list of header tuples used for the record
    """
    # set default date and convert to string if necessary
    warc_date = warc_date or timezone.now()
    if hasattr(warc_date, 'isoformat'):
        warc_date = warctools.warc.warc_datetime_str(warc_date)

    # Only close the file afterwards if this call opened it.
    close_file = not out_file
    out_file = out_file or self.open_warc_for_writing()

    # hexdigest() returns text; %-formatting it into a bytes literal raises
    # TypeError on Python 3, so format as text first and then encode.
    block_digest = ('sha1:%s' % hashlib.sha1(data).hexdigest()).encode('ascii')

    headers = [
        (warctools.WarcRecord.TYPE, record_type),
        (warctools.WarcRecord.ID, warctools.WarcRecord.random_warc_uuid()),
        (warctools.WarcRecord.DATE, warc_date),
        (warctools.WarcRecord.URL, url),
        (warctools.WarcRecord.BLOCK_DIGEST, block_digest),
    ]
    if extra_headers:
        headers.extend(extra_headers)

    record = warctools.WarcRecord(headers=headers, content=(content_type, data))
    record.write_to(out_file, gzip=True)

    if close_file:
        self.close_warc_after_writing(out_file)

    return headers
def build_warcinfo_record(self, filename):
    """
    Build the ``warcinfo`` record for the WARC file named *filename*.

    The record body is a CRLF-terminated list of ``application/warc-fields``
    lines giving the warcprox version, this machine's hostname, the
    address returned by ``self._local_address()`` and the WARC format.

    :param filename: WARC file name (text); stored latin1-encoded in the
        FILENAME header
    :return: the constructed ``warctools.WarcRecord``
    """
    record_date = self.format_warc_date(datetime.datetime.utcnow())
    headers = [
        (warctools.WarcRecord.ID, warctools.WarcRecord.random_warc_uuid()),
        (warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO),
        (warctools.WarcRecord.FILENAME, filename.encode('latin1')),
        (warctools.WarcRecord.DATE, record_date),
    ]

    software_line = (
        b'software: warcprox ' + warcprox.__version__.encode('latin1'))
    hostname_line = 'hostname: {}'.format(
        socket.gethostname()).encode('latin1')
    ip_line = ('ip: %s' % self._local_address()).encode('latin1')
    # Fields such as robots, description and isPartOf are not emitted.
    field_lines = [
        software_line,
        hostname_line,
        ip_line,
        b'format: WARC File Format 1.0',
    ]
    body = b'\r\n'.join(field_lines) + b'\r\n'

    return warctools.WarcRecord(
        headers=headers, content=(b'application/warc-fields', body))
def write_perma_warc_header(out_file, guid, timestamp):
    """
    Write a gzipped ``warcinfo`` record for a Perma capture to *out_file*.

    The record body lists the operator, the WARC format and the given
    *guid* as CRLF-terminated ``application/warc-fields`` lines.

    :param out_file: writable file object the record is written to
    :param guid: identifier placed in the ``Perma-GUID`` field
    :param timestamp: value passed to ``warctools.warc.warc_datetime_str``
        to produce the record's DATE header
    """
    record_headers = [
        (warctools.WarcRecord.ID, warctools.WarcRecord.random_warc_uuid()),
        (warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO),
        (warctools.WarcRecord.DATE,
         warctools.warc.warc_datetime_str(timestamp)),
    ]

    field_lines = (
        b'operator: Perma.cc',
        b'format: WARC File Format 1.0',
        'Perma-GUID: {}'.format(guid).encode('utf-8'),
    )
    # Terminate every field, including the last, with CRLF.
    body = b''.join(line + b'\r\n' for line in field_lines)

    info_record = warctools.WarcRecord(
        headers=record_headers,
        content=(b'application/warc-fields', body),
    )
    info_record.write_to(out_file, gzip=True)
def write_resource_record_from_asset(data, url, content_type, out_file, extra_headers=None):
    """
    Write one gzipped WARC ``resource`` record for an asset to *out_file*.

    An asset is something like a screenshot or an uploaded file.

    :param data: asset content as bytes; also hashed with SHA-1 for the
        block digest header
    :param url: URL of the asset (text); stored UTF-8 encoded
    :param content_type: content type of the asset (text)
    :param out_file: writable file object the record is written to
    :param extra_headers: optional iterable of extra header tuples
    """
    stamp = warctools.warc.warc_datetime_str(timezone.now())
    # Collapse a '+00:00' offset followed by 'Z' into a plain 'Z'.
    warc_date = stamp.replace(b'+00:00Z', b'Z')

    record_headers = [
        (warctools.WarcRecord.TYPE, warctools.WarcRecord.RESOURCE),
        (warctools.WarcRecord.ID, warctools.WarcRecord.random_warc_uuid()),
        (warctools.WarcRecord.DATE, warc_date),
        (warctools.WarcRecord.URL, bytes(url, 'utf-8')),
        (
            warctools.WarcRecord.BLOCK_DIGEST,
            'sha1:{}'.format(hashlib.sha1(data).hexdigest()).encode('utf-8'),
        ),
    ]
    record_headers.extend(extra_headers or ())

    asset_record = warctools.WarcRecord(
        headers=record_headers,
        content=(bytes(content_type, 'utf-8'), data),
    )
    asset_record.write_to(out_file, gzip=True)
def write_record(self, headers, content_type, content):
    '''
    Append one gzipped WARC record (of any type) to ``self.warc_fname``.

    :param headers list of header tuples [('foo', 'bar')]
    :param content_type WARC Content-Type header string
    :param content WARC payload
    '''
    self.bump_serial(sys.getsizeof(content))

    # Pass every header name and value through _bytes before building
    # the record.
    bheaders = [(_bytes(name), _bytes(value)) for name, value in headers]

    with open(self.warc_fname, 'ab') as warc_fh:
        payload = (_bytes(content_type), _bytes(content))
        warctools.WarcRecord(headers=bheaders, content=payload).write_to(
            warc_fh, gzip=True)
        # tell() must be called while the file is still open.
        self.log.info('Wrote %s bytes (%s) to file: %s',
                      warc_fh.tell(), content_type, self.warc_fname)
def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
        concurrent_to=None, warc_type=None, content_type=None,
        remote_ip=None, profile=None, refers_to=None,
        refers_to_target_uri=None, refers_to_date=None,
        payload_digest=None, truncated=None, content_length=None):
    """
    Build a ``warctools.WarcRecord`` from either a recorder or raw data.

    The record always carries ID, DATE, URL, Content-Length and block
    digest headers; the remaining headers are added only when the
    matching argument is not None.

    :param url: value for the record's URL header
    :param warc_date: pre-formatted record date; defaults to the current
        UTC time formatted with ``self.format_warc_date``
    :param recorder: object supplying ``len()``, ``block_digest`` and
        ``tempfile``; when given, content is read from its tempfile
    :param data: bytes, or an object with ``read``, used as content when
        ``recorder`` is None
    :param payload_digest: payload digest header value; when falsy and
        there is no recorder, it is computed from ``data``
    :param truncated: value for the ``WARC-Truncated`` header
    :param content_length: overrides the length otherwise taken from
        ``len(recorder)`` or ``len(data)``
    :return: the constructed ``warctools.WarcRecord``
    """
    if warc_date is None:
        warc_date = self.format_warc_date(datetime.datetime.utcnow())
    record_id = warctools.WarcRecord.random_warc_uuid()

    rec_cls = warctools.WarcRecord

    headers = []
    if warc_type is not None:
        headers.append((rec_cls.TYPE, warc_type))
    headers.extend([
        (rec_cls.ID, record_id),
        (rec_cls.DATE, warc_date),
        (rec_cls.URL, url),
    ])

    # Optional headers in output order. The header constant is looked up
    # by name only when its value is actually present.
    optional_headers = (
        ('IP_ADDRESS', remote_ip),
        ('PROFILE', profile),
        ('REFERS_TO', refers_to),
        ('REFERS_TO_TARGET_URI', refers_to_target_uri),
        ('REFERS_TO_DATE', refers_to_date),
        ('CONCURRENT_TO', concurrent_to),
        ('CONTENT_TYPE', content_type),
        ('PAYLOAD_DIGEST', payload_digest),
    )
    for const_name, value in optional_headers:
        if value is not None:
            headers.append((getattr(rec_cls, const_name), value))

    # truncated value may be 'length' or 'time'
    if truncated is not None:
        headers.append((b'WARC-Truncated', truncated))

    # An explicit content_length wins; otherwise measure the content source.
    if content_length is not None:
        length = content_length
    else:
        length = len(recorder if recorder is not None else data)
    headers.append((rec_cls.CONTENT_LENGTH, str(length).encode('latin1')))

    if recorder is not None:
        headers.append((
            rec_cls.BLOCK_DIGEST,
            warcprox.digest_str(recorder.block_digest, self.base32)))
        # Rewind so the record content is read from the start of the file.
        recorder.tempfile.seek(0)
        return rec_cls(headers=headers, content_file=recorder.tempfile)

    # no http headers so block digest == payload digest
    if not payload_digest:
        payload_digest = warcprox.digest_str(
            hashlib.new(self.digest_algorithm, data), self.base32)
        headers.append((rec_cls.PAYLOAD_DIGEST, payload_digest))
    headers.append((rec_cls.BLOCK_DIGEST, payload_digest))

    if hasattr(data, 'read'):
        return rec_cls(headers=headers, content_file=data)
    return rec_cls(headers=headers, content=(content_type, data))