def load(self, id):
    """Fetch, decrypt and decode the archive metadata block for `id`."""
    self.id = id
    data = self.key.decrypt(self.id, self.repository.get(self.id))
    self.metadata = msgpack.unpackb(data)
    if self.metadata[b'version'] != 1:
        raise Exception('Unknown archive metadata version')
    # Convert well-known byte-string fields to str for downstream use
    decode_dict(self.metadata, (b'name', b'hostname', b'username', b'time'))
    self.metadata[b'cmdline'] = [arg.decode('utf-8', 'surrogateescape') for arg in self.metadata[b'cmdline']]
    self.name = self.metadata[b'name']
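# `decode_dict` is called throughout this section but not defined in it.
# A minimal sketch of a compatible helper, assuming it decodes the named
# byte-string values in place and returns the dict (unpack_many below
# relies on the return value):
def decode_dict(d, keys, encoding='utf-8', errors='surrogateescape'):
    for key in keys:
        if isinstance(d.get(key), bytes):
            d[key] = d[key].decode(encoding, errors)
    return d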
def unpack_many(self, ids, filter=None, preload=False):
    """Stream items from the given metadata chunk ids, optionally
    filtering them and preloading the data chunks they reference."""
    unpacker = msgpack.Unpacker(use_list=False)
    for data in self.fetch_many(ids):
        unpacker.feed(data)
        items = [decode_dict(item, (b'path', b'source', b'user', b'group')) for item in unpacker]
        if filter:
            items = [item for item in items if filter(item)]
        if preload:
            for item in items:
                if b'chunks' in item:
                    self.repository.preload([c[0] for c in item[b'chunks']])
        for item in items:
            yield item
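# unpack_many feeds raw buffers into a single streaming Unpacker, so item
# boundaries need not line up with chunk boundaries. A self-contained
# demonstration of that pattern (the sample data is made up):
import msgpack

_items = [{b'path': b'/etc/hosts'}, {b'path': b'/etc/fstab'}]
_stream = b''.join(msgpack.packb(i) for i in _items)
_unpacker = msgpack.Unpacker(use_list=False)
for _buf in (_stream[:7], _stream[7:]):  # arbitrary split, mid-item
    _unpacker.feed(_buf)
for _item in _unpacker:
    print(_item)  # both dicts come out intact despite the split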
def rebuild_refcounts(self):
    """Rebuild object reference counts by walking the metadata

    Missing and/or incorrect data is repaired when detected
    """
    # Exclude the manifest from chunks
    del self.chunks[Manifest.MANIFEST_ID]

    def mark_as_possibly_superseded(id_):
        if self.chunks.get(id_, (0,))[0] == 0:
            self.possibly_superseded.add(id_)

    def add_callback(chunk):
        id_ = self.key.id_hash(chunk)
        cdata = self.key.encrypt(chunk)
        add_reference(id_, len(chunk), len(cdata), cdata)
        return id_

    def add_reference(id_, size, csize, cdata=None):
        try:
            count, _, _ = self.chunks[id_]
            self.chunks[id_] = count + 1, size, csize
        except KeyError:
            assert cdata is not None
            self.chunks[id_] = 1, size, csize
            if self.repair:
                self.repository.put(id_, cdata)

    def verify_file_chunks(item):
        """Verifies that all file chunks are present

        Missing file chunks will be replaced with new chunks of the same
        length containing all zeros.
        """
        offset = 0
        chunk_list = []
        for chunk_id, size, csize in item[b'chunks']:
            if chunk_id not in self.chunks:
                # If a file chunk is missing, create an all empty replacement chunk
                self.report_progress('{}: Missing file chunk detected (Byte {}-{})'.format(
                    item[b'path'].decode('utf-8', 'surrogateescape'), offset, offset + size), error=True)
                data = bytes(size)
                chunk_id = self.key.id_hash(data)
                cdata = self.key.encrypt(data)
                csize = len(cdata)
                add_reference(chunk_id, size, csize, cdata)
            else:
                add_reference(chunk_id, size, csize)
            chunk_list.append((chunk_id, size, csize))
            offset += size
        item[b'chunks'] = chunk_list

    def robust_iterator(archive):
        """Iterates through all archive items

        Missing item chunks will be skipped and the msgpack stream will be restarted
        """
        unpacker = RobustUnpacker(lambda item: isinstance(item, dict) and b'path' in item)
        _state = 0

        def missing_chunk_detector(chunk_id):
            nonlocal _state
            if _state % 2 != int(chunk_id not in self.chunks):
                _state += 1
            return _state

        for state, items in groupby(archive[b'items'], missing_chunk_detector):
            items = list(items)
            if state % 2:
                self.report_progress('Archive metadata damage detected', error=True)
                continue
            if state > 0:
                unpacker.resync()
            for chunk_id, cdata in zip(items, repository.get_many(items)):
                unpacker.feed(self.key.decrypt(chunk_id, cdata))
                for item in unpacker:
                    yield item

    if isinstance(self.repository, RemoteRepository):
        repository = RepositoryCache(self.repository)
    else:
        repository = self.repository
    num_archives = len(self.manifest.archives)
    for i, (name, info) in enumerate(list(self.manifest.archives.items()), 1):
        self.report_progress('Analyzing archive {} ({}/{})'.format(name, i, num_archives))
        archive_id = info[b'id']
        if archive_id not in self.chunks:
            self.report_progress('Archive metadata block is missing', error=True)
            del self.manifest.archives[name]
            continue
        mark_as_possibly_superseded(archive_id)
        cdata = self.repository.get(archive_id)
        data = self.key.decrypt(archive_id, cdata)
        archive = StableDict(msgpack.unpackb(data))
        if archive[b'version'] != 1:
            raise Exception('Unknown archive metadata version')
        decode_dict(archive, (b'name', b'hostname', b'username', b'time'))
        # fixme: argv
        items_buffer = ChunkBuffer(self.key)
        items_buffer.write_chunk = add_callback
        for item in robust_iterator(archive):
            if b'chunks' in item:
                verify_file_chunks(item)
            items_buffer.add(item)
        items_buffer.flush(flush=True)
        for previous_item_id in archive[b'items']:
            mark_as_possibly_superseded(previous_item_id)
        archive[b'items'] = items_buffer.chunks
        data = msgpack.packb(archive, unicode_errors='surrogateescape')
        new_archive_id = self.key.id_hash(data)
        cdata = self.key.encrypt(data)
        add_reference(new_archive_id, len(data), len(cdata), cdata)
        info[b'id'] = new_archive_id
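# The parity trick in missing_chunk_detector: the shared counter is bumped
# whenever chunk presence flips, so groupby() splits the id sequence into
# alternating present/missing runs (odd state = missing run). A standalone
# sketch of that mechanism, with made-up ids and a plain set standing in
# for self.chunks:
from itertools import groupby

def _split_runs(chunk_ids, present):
    _state = 0
    def detector(chunk_id):
        nonlocal _state
        if _state % 2 != int(chunk_id not in present):
            _state += 1
        return _state
    for state, ids in groupby(chunk_ids, detector):
        yield ('missing' if state % 2 else 'present', list(ids))

print(list(_split_runs([b'a', b'b', b'c', b'd'], present={b'a', b'b', b'd'})))
# [('present', [b'a', b'b']), ('missing', [b'c']), ('present', [b'd'])]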
def get_next(self):
    """Return the next decoded item accepted by self.filter, skipping the rest."""
    while True:
        n = next(self.unpacker)
        decode_dict(n, (b"path", b"source", b"user", b"group"))
        if not self.filter or self.filter(n):
            return n
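# get_next keeps pulling until the filter accepts an item; the StopIteration
# raised by next() on an exhausted unpacker ends iteration naturally. A
# hypothetical driver (the name `drain` and the `stream` object owning
# get_next() are assumptions, not part of the original API):
def drain(stream, limit=None):
    out = []
    try:
        while limit is None or len(out) < limit:
            out.append(stream.get_next())
    except StopIteration:
        pass
    return out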