def init_chunks(self):
    """Fetch a list of all object keys from repository
    """
    # Explicitly set the initial hash table capacity to avoid performance issues
    # due to hash table "resonance"
    capacity = int(len(self.repository) * 1.2)
    self.chunks = ChunkIndex(capacity)
    marker = None
    while True:
        result = self.repository.list(limit=10000, marker=marker)
        if not result:
            break
        marker = result[-1]
        for id_ in result:
            self.chunks[id_] = (0, 0, 0)
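# A minimal standalone sketch of the marker-based paging protocol the loop
# above relies on: repository.list() returns at most `limit` keys starting
# after `marker`, and an empty result ends the scan. FakeRepository is
# hypothetical; only the list(limit=..., marker=...) semantics are taken
# from the code above.
class FakeRepository:
    def __init__(self, keys):
        self.keys = sorted(keys)

    def __len__(self):
        return len(self.keys)

    def list(self, limit=10000, marker=None):
        start = 0 if marker is None else self.keys.index(marker) + 1
        return self.keys[start:start + limit]

repo = FakeRepository([b'k%03d' % i for i in range(25)])
seen, marker = [], None
while True:
    result = repo.list(limit=10, marker=marker)
    if not result:
        break
    marker = result[-1]
    seen.extend(result)
assert seen == repo.keys  # every key visited exactly once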
def init_chunks(self):
    """Fetch a list of all object keys from repository
    """
    # Explicitly set the initial hash table capacity to avoid performance issues
    # due to hash table "resonance"
    capacity = int(len(self.repository) * 1.2)
    self.chunks = ChunkIndex.create(os.path.join(self.tmpdir, 'chunks').encode('utf-8'),
                                    capacity=capacity)
    marker = None
    while True:
        result = self.repository.list(limit=10000, marker=marker)
        if not result:
            break
        marker = result[-1]
        for id_ in result:
            self.chunks[id_] = (0, 0, 0)
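# A minimal lifecycle sketch for the file-backed index variant above. Only
# the ChunkIndex.create() signature and the (refcount, size, csize) tuple
# protocol shown in init_chunks are assumed to exist; the 32-byte key width
# is an assumption. The temporary directory mirrors the tmpdir that
# ArchiveChecker creates in __init__ and removes in __del__.
import os
import shutil
import tempfile

tmpdir = tempfile.mkdtemp()
try:
    index = ChunkIndex.create(os.path.join(tmpdir, 'chunks').encode('utf-8'),
                              capacity=1200)  # ~1.2 * expected key count
    index[b'\x00' * 32] = (0, 0, 0)  # all counters start at zero, as above
finally:
    shutil.rmtree(tmpdir)  # the backing file does not outlive the check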
import shutil
import sys
import tempfile
from itertools import groupby

import msgpack

# Package-internal helpers (ChunkIndex, Manifest, key_factory, RobustUnpacker,
# ChunkBuffer, StableDict, decode_dict, cache_if_remote) are assumed to be
# imported from the surrounding package.


class ArchiveChecker:

    def __init__(self):
        self.error_found = False
        self.possibly_superseded = set()
        self.tmpdir = tempfile.mkdtemp()

    def __del__(self):
        shutil.rmtree(self.tmpdir)

    def check(self, repository, repair=False, last=None):
        """Run the consistency check, optionally limited to the `last` N archives"""
        self.report_progress('Starting archive consistency check...')
        self.repair = repair
        self.repository = repository
        self.init_chunks()
        self.key = self.identify_key(repository)
        if Manifest.MANIFEST_ID not in self.chunks:
            self.manifest = self.rebuild_manifest()
        else:
            self.manifest, _ = Manifest.load(repository, key=self.key)
        self.rebuild_refcounts(last=last)
        if last is None:
            self.verify_chunks()
        else:
            self.report_progress('Orphaned objects check skipped (needs all archives checked)')
        if not self.error_found:
            self.report_progress('Archive consistency check complete, no problems found.')
        return self.repair or not self.error_found

    def init_chunks(self):
        """Fetch a list of all object keys from repository
        """
        # Explicitly set the initial hash table capacity to avoid performance issues
        # due to hash table "resonance"
        capacity = int(len(self.repository) * 1.2)
        self.chunks = ChunkIndex(capacity)
        marker = None
        while True:
            result = self.repository.list(limit=10000, marker=marker)
            if not result:
                break
            marker = result[-1]
            for id_ in result:
                self.chunks[id_] = (0, 0, 0)

    def report_progress(self, msg, error=False):
        if error:
            self.error_found = True
        print(msg, file=sys.stderr if error else sys.stdout)

    def identify_key(self, repository):
        cdata = repository.get(next(self.chunks.iteritems())[0])
        return key_factory(repository, cdata)

    def rebuild_manifest(self):
        """Rebuild the manifest object if it is missing

        Iterates through all objects in the repository looking for archive metadata blocks.
        """
        self.report_progress('Rebuilding missing manifest, this might take some time...', error=True)
        manifest = Manifest(self.key, self.repository)
        for chunk_id, _ in self.chunks.iteritems():
            cdata = self.repository.get(chunk_id)
            data = self.key.decrypt(chunk_id, cdata)
            # Some basic sanity checks of the payload before feeding it into msgpack
            if len(data) < 2 or ((data[0] & 0xf0) != 0x80) or ((data[1] & 0xe0) != 0xa0):
                continue
            if b'cmdline' not in data or b'\xa7version\x01' not in data:
                continue
            try:
                archive = msgpack.unpackb(data)
            # Ignore exceptions that might be raised when feeding
            # msgpack with invalid data
            except (TypeError, ValueError, StopIteration):
                continue
            if isinstance(archive, dict) and b'items' in archive and b'cmdline' in archive:
                self.report_progress('Found archive ' + archive[b'name'].decode('utf-8'), error=True)
                manifest.archives[archive[b'name'].decode('utf-8')] = {b'id': chunk_id, b'time': archive[b'time']}
        self.report_progress('Manifest rebuild complete', error=True)
        return manifest

    def rebuild_refcounts(self, last=None):
        """Rebuild object reference counts by walking the metadata

        Missing and/or incorrect data is repaired when detected
        """
        # Exclude the manifest from chunks
        del self.chunks[Manifest.MANIFEST_ID]

        def mark_as_possibly_superseded(id_):
            if self.chunks.get(id_, (0,))[0] == 0:
                self.possibly_superseded.add(id_)

        def add_callback(chunk):
            id_ = self.key.id_hash(chunk)
            cdata = self.key.encrypt(chunk)
            add_reference(id_, len(chunk), len(cdata), cdata)
            return id_

        def add_reference(id_, size, csize, cdata=None):
            try:
                count, _, _ = self.chunks[id_]
                self.chunks[id_] = count + 1, size, csize
            except KeyError:
                assert cdata is not None
                self.chunks[id_] = 1, size, csize
                if self.repair:
                    self.repository.put(id_, cdata)

        def verify_file_chunks(item):
            """Verifies that all file chunks are present

            Missing file chunks will be replaced with new chunks of the same length containing all zeros.
            """
            offset = 0
            chunk_list = []
            for chunk_id, size, csize in item[b'chunks']:
                if chunk_id not in self.chunks:
                    # If a file chunk is missing, create an all empty replacement chunk
                    self.report_progress('{}: Missing file chunk detected (Byte {}-{})'.format(
                        item[b'path'].decode('utf-8', 'surrogateescape'), offset, offset + size), error=True)
                    data = bytes(size)
                    chunk_id = self.key.id_hash(data)
                    cdata = self.key.encrypt(data)
                    csize = len(cdata)
                    add_reference(chunk_id, size, csize, cdata)
                else:
                    add_reference(chunk_id, size, csize)
                chunk_list.append((chunk_id, size, csize))
                offset += size
            item[b'chunks'] = chunk_list

        def robust_iterator(archive):
            """Iterates through all archive items

            Missing item chunks will be skipped and the msgpack stream will be restarted
            """
            unpacker = RobustUnpacker(lambda item: isinstance(item, dict) and b'path' in item)
            _state = 0

            def missing_chunk_detector(chunk_id):
                nonlocal _state
                if _state % 2 != int(chunk_id not in self.chunks):
                    _state += 1
                return _state

            for state, items in groupby(archive[b'items'], missing_chunk_detector):
                items = list(items)
                if state % 2:
                    self.report_progress('Archive metadata damage detected', error=True)
                    continue
                if state > 0:
                    unpacker.resync()
                for chunk_id, cdata in zip(items, repository.get_many(items)):
                    unpacker.feed(self.key.decrypt(chunk_id, cdata))
                    for item in unpacker:
                        yield item

        repository = cache_if_remote(self.repository)
        num_archives = len(self.manifest.archives)
        archive_items = sorted(self.manifest.archives.items(), reverse=True,
                               key=lambda name_info: name_info[1][b'time'])
        end = None if last is None else min(num_archives, last)
        for i, (name, info) in enumerate(archive_items[:end]):
            self.report_progress('Analyzing archive {} ({}/{})'.format(name, num_archives - i, num_archives))
            archive_id = info[b'id']
            if archive_id not in self.chunks:
                self.report_progress('Archive metadata block is missing', error=True)
                del self.manifest.archives[name]
                continue
            mark_as_possibly_superseded(archive_id)
            cdata = self.repository.get(archive_id)
            data = self.key.decrypt(archive_id, cdata)
            archive = StableDict(msgpack.unpackb(data))
            if archive[b'version'] != 1:
                raise Exception('Unknown archive metadata version')
            decode_dict(archive, (b'name', b'hostname', b'username', b'time'))  # fixme: argv
            items_buffer = ChunkBuffer(self.key)
            items_buffer.write_chunk = add_callback
            for item in robust_iterator(archive):
                if b'chunks' in item:
                    verify_file_chunks(item)
                items_buffer.add(item)
            items_buffer.flush(flush=True)
            for previous_item_id in archive[b'items']:
                mark_as_possibly_superseded(previous_item_id)
            archive[b'items'] = items_buffer.chunks
            data = msgpack.packb(archive, unicode_errors='surrogateescape')
            new_archive_id = self.key.id_hash(data)
            cdata = self.key.encrypt(data)
            add_reference(new_archive_id, len(data), len(cdata), cdata)
            info[b'id'] = new_archive_id

    def verify_chunks(self):
        """Identify unused and orphaned chunks; in repair mode, delete unused chunks"""
        unused = set()
        for id_, (count, size, csize) in self.chunks.iteritems():
            if count == 0:
                unused.add(id_)
        orphaned = unused - self.possibly_superseded
        if orphaned:
            self.report_progress('{} orphaned objects found'.format(len(orphaned)), error=True)
        if self.repair:
            for id_ in unused:
                self.repository.delete(id_)
            self.manifest.write()
            self.repository.commit()
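# A standalone sketch of the groupby state machine used by robust_iterator
# above: the parity of _state flips whenever the "is this chunk missing?"
# answer changes, so groupby splits the chunk list into alternating runs of
# present (even state) and missing (odd state) chunks. The ids and the
# `present` set are hypothetical.
from itertools import groupby

present = {b'a', b'b', b'e'}
chunk_ids = [b'a', b'b', b'c', b'd', b'e']

_state = 0
def missing_chunk_detector(chunk_id):
    global _state
    if _state % 2 != int(chunk_id not in present):
        _state += 1
    return _state

for state, run in groupby(chunk_ids, missing_chunk_detector):
    print('missing' if state % 2 else 'present', list(run))
# present [b'a', b'b']
# missing [b'c', b'd']
# present [b'e']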
class ArchiveChecker:

    def __init__(self):
        self.error_found = False
        self.possibly_superseded = set()
        self.tmpdir = tempfile.mkdtemp()

    def __del__(self):
        shutil.rmtree(self.tmpdir)

    def check(self, repository, repair=False):
        """Run the consistency check over all archives in the repository"""
        self.report_progress('Starting archive consistency check...')
        self.repair = repair
        self.repository = repository
        self.init_chunks()
        self.key = self.identify_key(repository)
        if Manifest.MANIFEST_ID not in self.chunks:
            self.manifest = self.rebuild_manifest()
        else:
            self.manifest, _ = Manifest.load(repository, key=self.key)
        self.rebuild_refcounts()
        self.verify_chunks()
        if not self.error_found:
            self.report_progress('Archive consistency check complete, no problems found.')
        return self.repair or not self.error_found

    def init_chunks(self):
        """Fetch a list of all object keys from repository
        """
        # Explicitly set the initial hash table capacity to avoid performance issues
        # due to hash table "resonance"
        capacity = int(len(self.repository) * 1.2)
        self.chunks = ChunkIndex(capacity)
        marker = None
        while True:
            result = self.repository.list(limit=10000, marker=marker)
            if not result:
                break
            marker = result[-1]
            for id_ in result:
                self.chunks[id_] = (0, 0, 0)

    def report_progress(self, msg, error=False):
        if error:
            self.error_found = True
        print(msg, file=sys.stderr if error else sys.stdout)

    def identify_key(self, repository):
        cdata = repository.get(next(self.chunks.iteritems())[0])
        return key_factory(repository, cdata)

    def rebuild_manifest(self):
        """Rebuild the manifest object if it is missing

        Iterates through all objects in the repository looking for archive metadata blocks.
        """
        self.report_progress('Rebuilding missing manifest, this might take some time...', error=True)
        manifest = Manifest(self.key, self.repository)
        for chunk_id, _ in self.chunks.iteritems():
            cdata = self.repository.get(chunk_id)
            data = self.key.decrypt(chunk_id, cdata)
            # Some basic sanity checks of the payload before feeding it into msgpack
            if len(data) < 2 or ((data[0] & 0xf0) != 0x80) or ((data[1] & 0xe0) != 0xa0):
                continue
            if b'cmdline' not in data or b'\xa7version\x01' not in data:
                continue
            try:
                archive = msgpack.unpackb(data)
            # Catch only the errors msgpack raises on invalid data; a bare
            # except here would also swallow KeyboardInterrupt and friends
            except (TypeError, ValueError, StopIteration):
                continue
            if isinstance(archive, dict) and b'items' in archive and b'cmdline' in archive:
                self.report_progress('Found archive ' + archive[b'name'].decode('utf-8'), error=True)
                manifest.archives[archive[b'name'].decode('utf-8')] = {b'id': chunk_id, b'time': archive[b'time']}
        self.report_progress('Manifest rebuild complete', error=True)
        return manifest

    def rebuild_refcounts(self):
        """Rebuild object reference counts by walking the metadata

        Missing and/or incorrect data is repaired when detected
        """
        # Exclude the manifest from chunks
        del self.chunks[Manifest.MANIFEST_ID]

        def mark_as_possibly_superseded(id_):
            if self.chunks.get(id_, (0,))[0] == 0:
                self.possibly_superseded.add(id_)

        def add_callback(chunk):
            id_ = self.key.id_hash(chunk)
            cdata = self.key.encrypt(chunk)
            add_reference(id_, len(chunk), len(cdata), cdata)
            return id_

        def add_reference(id_, size, csize, cdata=None):
            try:
                count, _, _ = self.chunks[id_]
                self.chunks[id_] = count + 1, size, csize
            except KeyError:
                assert cdata is not None
                self.chunks[id_] = 1, size, csize
                if self.repair:
                    self.repository.put(id_, cdata)

        def verify_file_chunks(item):
            """Verifies that all file chunks are present

            Missing file chunks will be replaced with new chunks of the same length containing all zeros.
            """
            offset = 0
            chunk_list = []
            for chunk_id, size, csize in item[b'chunks']:
                if chunk_id not in self.chunks:
                    # If a file chunk is missing, create an all empty replacement chunk
                    self.report_progress('{}: Missing file chunk detected (Byte {}-{})'.format(
                        item[b'path'].decode('utf-8', 'surrogateescape'), offset, offset + size), error=True)
                    data = bytes(size)
                    chunk_id = self.key.id_hash(data)
                    cdata = self.key.encrypt(data)
                    csize = len(cdata)
                    add_reference(chunk_id, size, csize, cdata)
                else:
                    add_reference(chunk_id, size, csize)
                chunk_list.append((chunk_id, size, csize))
                offset += size
            item[b'chunks'] = chunk_list

        def robust_iterator(archive):
            """Iterates through all archive items

            Missing item chunks will be skipped and the msgpack stream will be restarted
            """
            unpacker = RobustUnpacker(lambda item: isinstance(item, dict) and b'path' in item)
            _state = 0

            def missing_chunk_detector(chunk_id):
                nonlocal _state
                if _state % 2 != int(chunk_id not in self.chunks):
                    _state += 1
                return _state

            for state, items in groupby(archive[b'items'], missing_chunk_detector):
                items = list(items)
                if state % 2:
                    self.report_progress('Archive metadata damage detected', error=True)
                    continue
                if state > 0:
                    unpacker.resync()
                for chunk_id, cdata in zip(items, repository.get_many(items)):
                    unpacker.feed(self.key.decrypt(chunk_id, cdata))
                    for item in unpacker:
                        yield item

        repository = cache_if_remote(self.repository)
        num_archives = len(self.manifest.archives)
        for i, (name, info) in enumerate(list(self.manifest.archives.items()), 1):
            self.report_progress('Analyzing archive {} ({}/{})'.format(name, i, num_archives))
            archive_id = info[b'id']
            if archive_id not in self.chunks:
                self.report_progress('Archive metadata block is missing', error=True)
                del self.manifest.archives[name]
                continue
            mark_as_possibly_superseded(archive_id)
            cdata = self.repository.get(archive_id)
            data = self.key.decrypt(archive_id, cdata)
            archive = StableDict(msgpack.unpackb(data))
            if archive[b'version'] != 1:
                raise Exception('Unknown archive metadata version')
            decode_dict(archive, (b'name', b'hostname', b'username', b'time'))  # fixme: argv
            items_buffer = ChunkBuffer(self.key)
            items_buffer.write_chunk = add_callback
            for item in robust_iterator(archive):
                if b'chunks' in item:
                    verify_file_chunks(item)
                items_buffer.add(item)
            items_buffer.flush(flush=True)
            for previous_item_id in archive[b'items']:
                mark_as_possibly_superseded(previous_item_id)
            archive[b'items'] = items_buffer.chunks
            data = msgpack.packb(archive, unicode_errors='surrogateescape')
            new_archive_id = self.key.id_hash(data)
            cdata = self.key.encrypt(data)
            add_reference(new_archive_id, len(data), len(cdata), cdata)
            info[b'id'] = new_archive_id

    def verify_chunks(self):
        """Identify unused and orphaned chunks; in repair mode, delete unused chunks"""
        unused = set()
        for id_, (count, size, csize) in self.chunks.iteritems():
            if count == 0:
                unused.add(id_)
        orphaned = unused - self.possibly_superseded
        if orphaned:
            self.report_progress('{} orphaned objects found'.format(len(orphaned)), error=True)
        if self.repair:
            for id_ in unused:
                self.repository.delete(id_)
            self.manifest.write()
            self.repository.commit()