def sync(self):
    """Initializes cache by fetching and reading all archive indices
    """
    def add(id, size, csize):
        try:
            count, size, csize = self.chunks[id]
            self.chunks[id] = count + 1, size, csize
        except KeyError:
            self.chunks[id] = 1, size, csize
    self.begin_txn()
    print('Initializing cache...')
    self.chunks.clear()
    unpacker = msgpack.Unpacker()
    repository = cache_if_remote(self.repository)
    for name, info in self.manifest.archives.items():
        archive_id = info[b'id']
        cdata = repository.get(archive_id)
        data = self.key.decrypt(archive_id, cdata)
        add(archive_id, len(data), len(cdata))
        archive = msgpack.unpackb(data)
        if archive[b'version'] != 1:
            raise Exception('Unknown archive metadata version')
        decode_dict(archive, (b'name',))
        print('Analyzing archive:', archive[b'name'])
        for key, chunk in zip(archive[b'items'], repository.get_many(archive[b'items'])):
            data = self.key.decrypt(key, chunk)
            add(key, len(data), len(chunk))
            unpacker.feed(data)
            for item in unpacker:
                if b'chunks' in item:
                    for chunk_id, size, csize in item[b'chunks']:
                        add(chunk_id, size, csize)
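The `add` helper above accumulates a `(refcount, size, csize)` tuple per chunk id while the archives are walked. A minimal standalone sketch of that accumulation pattern, with a plain dict standing in for the cache's chunk index (the names here are illustrative, not part of the original code):

chunks = {}

def add(id_, size, csize):
    try:
        count, size, csize = chunks[id_]       # existing entry: keep the stored sizes
        chunks[id_] = count + 1, size, csize   # bump the reference count
    except KeyError:
        chunks[id_] = 1, size, csize           # first reference to this chunk

add(b'chunk-a', 1024, 300)
add(b'chunk-a', 1024, 300)
add(b'chunk-b', 2048, 700)
print(chunks[b'chunk-a'])   # (2, 1024, 300): the chunk is referenced twice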
def __init__(self, key, repository, manifest, archive):
    super(AtticOperations, self).__init__()
    self._inode_count = 0
    self.key = key
    self.repository = cache_if_remote(repository)
    self.items = {}
    self.parent = {}
    self.contents = defaultdict(dict)
    self.default_dir = {b'mode': 0o40755, b'mtime': int(time.time() * 1e9),
                        b'uid': os.getuid(), b'gid': os.getgid()}
    self.pending_archives = {}
    self.cache = ItemCache()
    if archive:
        self.process_archive(archive)
    else:
        # Create root inode
        self.parent[1] = self.allocate_inode()
        self.items[1] = self.default_dir
        for archive_name in manifest.archives:
            # Create archive placeholder inode
            archive_inode = self.allocate_inode()
            self.items[archive_inode] = self.default_dir
            self.parent[archive_inode] = 1
            self.contents[1][os.fsencode(archive_name)] = archive_inode
            self.pending_archives[archive_inode] = Archive(repository, key, manifest, archive_name)
def sync(self):
    """Initializes cache by fetching and reading all archive indices
    """
    def add(id, size, csize):
        try:
            count, size, csize = self.chunks[id]
            self.chunks[id] = count + 1, size, csize
        except KeyError:
            self.chunks[id] = 1, size, csize
    self.begin_txn()
    print('Initializing cache...')
    self.chunks.clear()
    unpacker = msgpack.Unpacker()
    repository = cache_if_remote(self.repository)
    for name, info in self.manifest.archives.items():
        archive_id = info[b'id']
        cdata = repository.get(archive_id)
        data = self.key.decrypt(archive_id, cdata)
        add(archive_id, len(data), len(cdata))
        archive = msgpack.unpackb(data)
        if archive[b'version'] != 1:
            raise Exception('Unknown archive metadata version')
        decode_dict(archive, (b'name',))
        print('Analyzing archive:', archive[b'name'])
        for key, chunk in zip(archive[b'items'], repository.get_many(archive[b'items'])):
            data = self.key.decrypt(key, chunk)
            add(key, len(data), len(chunk))
            unpacker.feed(data)
            for item in unpacker:
                if b'chunks' in item:
                    for chunk_id, size, csize in item[b'chunks']:
                        add(chunk_id, size, csize)
def __init__(self, key, repository, manifest, archive):
    super(FuseOperations, self).__init__()
    self._inode_count = 0
    self.key = key
    self.repository = cache_if_remote(repository)
    self.items = {}
    self.parent = {}
    self.contents = defaultdict(dict)
    self.default_dir = {b'mode': 0o40755, b'mtime': int(time.time() * 1e9),
                        b'uid': os.getuid(), b'gid': os.getgid()}
    self.pending_archives = {}
    self.accounted_chunks = {}
    self.cache = ItemCache()
    if archive:
        self.process_archive(archive)
    else:
        # Create root inode
        self.parent[1] = self.allocate_inode()
        self.items[1] = self.default_dir
        for archive_name in manifest.archives:
            # Create archive placeholder inode
            archive_inode = self.allocate_inode()
            self.items[archive_inode] = self.default_dir
            self.parent[archive_inode] = 1
            self.contents[1][os.fsencode(archive_name)] = archive_inode
            self.pending_archives[archive_inode] = Archive(repository, key, manifest, archive_name)
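Both constructors keep the same in-memory filesystem bookkeeping: `items` maps inode numbers to metadata dicts, `parent` maps each inode to its parent inode, and `contents` maps a directory inode to a `{name: inode}` table, with one placeholder directory per archive hanging off the root. A self-contained sketch of that scheme, where `allocate_inode` and the archive names are stand-ins rather than the original implementation:

import os
from collections import defaultdict

items = {}                     # inode -> metadata dict
parent = {}                    # inode -> parent inode
contents = defaultdict(dict)   # directory inode -> {entry name (bytes): inode}

_inode_count = 0

def allocate_inode():
    # Hypothetical stand-in: hand out inode numbers 1, 2, 3, ...
    global _inode_count
    _inode_count += 1
    return _inode_count

default_dir = {b'mode': 0o40755}

# Root directory gets inode 1; every archive becomes a placeholder directory under it.
parent[1] = allocate_inode()
items[1] = default_dir
for archive_name in ['2014-01-01', '2014-01-02']:   # illustrative archive names
    inode = allocate_inode()
    items[inode] = default_dir
    parent[inode] = 1
    contents[1][os.fsencode(archive_name)] = inode

print(contents[1])   # {b'2014-01-01': 2, b'2014-01-02': 3}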
def rebuild_refcounts(self, last=None):
    """Rebuild object reference counts by walking the metadata

    Missing and/or incorrect data is repaired when detected
    """
    # Exclude the manifest from chunks
    del self.chunks[Manifest.MANIFEST_ID]

    def mark_as_possibly_superseded(id_):
        if self.chunks.get(id_, (0,))[0] == 0:
            self.possibly_superseded.add(id_)

    def add_callback(chunk):
        id_ = self.key.id_hash(chunk)
        cdata = self.key.encrypt(chunk)
        add_reference(id_, len(chunk), len(cdata), cdata)
        return id_

    def add_reference(id_, size, csize, cdata=None):
        try:
            count, _, _ = self.chunks[id_]
            self.chunks[id_] = count + 1, size, csize
        except KeyError:
            assert cdata is not None
            self.chunks[id_] = 1, size, csize
            if self.repair:
                self.repository.put(id_, cdata)

    def verify_file_chunks(item):
        """Verifies that all file chunks are present

        Missing file chunks will be replaced with new chunks of the same
        length containing all zeros.
        """
        offset = 0
        chunk_list = []
        for chunk_id, size, csize in item[b'chunks']:
            if chunk_id not in self.chunks:
                # If a file chunk is missing, create an all empty replacement chunk
                self.report_progress('{}: Missing file chunk detected (Byte {}-{})'.format(
                    item[b'path'].decode('utf-8', 'surrogateescape'), offset, offset + size), error=True)
                data = bytes(size)
                chunk_id = self.key.id_hash(data)
                cdata = self.key.encrypt(data)
                csize = len(cdata)
                add_reference(chunk_id, size, csize, cdata)
            else:
                add_reference(chunk_id, size, csize)
            chunk_list.append((chunk_id, size, csize))
            offset += size
        item[b'chunks'] = chunk_list

    def robust_iterator(archive):
        """Iterates through all archive items

        Missing item chunks will be skipped and the msgpack stream will be restarted
        """
        unpacker = RobustUnpacker(lambda item: isinstance(item, dict) and b'path' in item)
        _state = 0

        def missing_chunk_detector(chunk_id):
            nonlocal _state
            if _state % 2 != int(chunk_id not in self.chunks):
                _state += 1
            return _state

        for state, items in groupby(archive[b'items'], missing_chunk_detector):
            items = list(items)
            if state % 2:
                self.report_progress('Archive metadata damage detected', error=True)
                continue
            if state > 0:
                unpacker.resync()
            for chunk_id, cdata in zip(items, repository.get_many(items)):
                unpacker.feed(self.key.decrypt(chunk_id, cdata))
                for item in unpacker:
                    yield item

    repository = cache_if_remote(self.repository)
    num_archives = len(self.manifest.archives)
    archive_items = sorted(self.manifest.archives.items(), reverse=True,
                           key=lambda name_info: name_info[1][b'time'])
    end = None if last is None else min(num_archives, last)
    for i, (name, info) in enumerate(archive_items[:end]):
        self.report_progress('Analyzing archive {} ({}/{})'.format(name, num_archives - i, num_archives))
        archive_id = info[b'id']
        if archive_id not in self.chunks:
            self.report_progress('Archive metadata block is missing', error=True)
            del self.manifest.archives[name]
            continue
        mark_as_possibly_superseded(archive_id)
        cdata = self.repository.get(archive_id)
        data = self.key.decrypt(archive_id, cdata)
        archive = StableDict(msgpack.unpackb(data))
        if archive[b'version'] != 1:
            raise Exception('Unknown archive metadata version')
        decode_dict(archive, (b'name', b'hostname', b'username', b'time'))  # fixme: argv
        items_buffer = ChunkBuffer(self.key)
        items_buffer.write_chunk = add_callback
        for item in robust_iterator(archive):
            if b'chunks' in item:
                verify_file_chunks(item)
            items_buffer.add(item)
        items_buffer.flush(flush=True)
        for previous_item_id in archive[b'items']:
            mark_as_possibly_superseded(previous_item_id)
        archive[b'items'] = items_buffer.chunks
        data = msgpack.packb(archive, unicode_errors='surrogateescape')
        new_archive_id = self.key.id_hash(data)
        cdata = self.key.encrypt(data)
        add_reference(new_archive_id, len(data), len(cdata), cdata)
        info[b'id'] = new_archive_id
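`robust_iterator` leans on a small state machine: `missing_chunk_detector` bumps a counter whenever the sequence switches between present and missing chunk ids, so `itertools.groupby` yields one group per contiguous run and every odd-state group is a damaged span that gets skipped before resyncing the unpacker. A standalone sketch of that grouping trick, with a plain set standing in for `self.chunks`:

from itertools import groupby

present = {b'a', b'b', b'd', b'e'}                 # chunk ids known to exist
item_chunk_ids = [b'a', b'b', b'c', b'd', b'e']    # b'c' is missing

_state = 0

def missing_chunk_detector(chunk_id):
    # Increment the state whenever we cross a present/missing boundary,
    # so groupby() emits one group per contiguous run.
    global _state
    if _state % 2 != int(chunk_id not in present):
        _state += 1
    return _state

for state, ids in groupby(item_chunk_ids, missing_chunk_detector):
    ids = list(ids)
    if state % 2:
        print('damaged run, skipping:', ids)        # odd states are missing chunks
    else:
        print('intact run, feeding unpacker:', ids)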
def rebuild_refcounts(self):
    """Rebuild object reference counts by walking the metadata

    Missing and/or incorrect data is repaired when detected
    """
    # Exclude the manifest from chunks
    del self.chunks[Manifest.MANIFEST_ID]

    def mark_as_possibly_superseded(id_):
        if self.chunks.get(id_, (0,))[0] == 0:
            self.possibly_superseded.add(id_)

    def add_callback(chunk):
        id_ = self.key.id_hash(chunk)
        cdata = self.key.encrypt(chunk)
        add_reference(id_, len(chunk), len(cdata), cdata)
        return id_

    def add_reference(id_, size, csize, cdata=None):
        try:
            count, _, _ = self.chunks[id_]
            self.chunks[id_] = count + 1, size, csize
        except KeyError:
            assert cdata is not None
            self.chunks[id_] = 1, size, csize
            if self.repair:
                self.repository.put(id_, cdata)

    def verify_file_chunks(item):
        """Verifies that all file chunks are present

        Missing file chunks will be replaced with new chunks of the same
        length containing all zeros.
        """
        offset = 0
        chunk_list = []
        for chunk_id, size, csize in item[b'chunks']:
            if chunk_id not in self.chunks:
                # If a file chunk is missing, create an all empty replacement chunk
                self.report_progress('{}: Missing file chunk detected (Byte {}-{})'.format(
                    item[b'path'].decode('utf-8', 'surrogateescape'), offset, offset + size), error=True)
                data = bytes(size)
                chunk_id = self.key.id_hash(data)
                cdata = self.key.encrypt(data)
                csize = len(cdata)
                add_reference(chunk_id, size, csize, cdata)
            else:
                add_reference(chunk_id, size, csize)
            chunk_list.append((chunk_id, size, csize))
            offset += size
        item[b'chunks'] = chunk_list

    def robust_iterator(archive):
        """Iterates through all archive items

        Missing item chunks will be skipped and the msgpack stream will be restarted
        """
        unpacker = RobustUnpacker(lambda item: isinstance(item, dict) and b'path' in item)
        _state = 0

        def missing_chunk_detector(chunk_id):
            nonlocal _state
            if _state % 2 != int(chunk_id not in self.chunks):
                _state += 1
            return _state

        for state, items in groupby(archive[b'items'], missing_chunk_detector):
            items = list(items)
            if state % 2:
                self.report_progress('Archive metadata damage detected', error=True)
                continue
            if state > 0:
                unpacker.resync()
            for chunk_id, cdata in zip(items, repository.get_many(items)):
                unpacker.feed(self.key.decrypt(chunk_id, cdata))
                for item in unpacker:
                    yield item

    repository = cache_if_remote(self.repository)
    num_archives = len(self.manifest.archives)
    for i, (name, info) in enumerate(list(self.manifest.archives.items()), 1):
        self.report_progress('Analyzing archive {} ({}/{})'.format(name, i, num_archives))
        archive_id = info[b'id']
        if archive_id not in self.chunks:
            self.report_progress('Archive metadata block is missing', error=True)
            del self.manifest.archives[name]
            continue
        mark_as_possibly_superseded(archive_id)
        cdata = self.repository.get(archive_id)
        data = self.key.decrypt(archive_id, cdata)
        archive = StableDict(msgpack.unpackb(data))
        if archive[b'version'] != 1:
            raise Exception('Unknown archive metadata version')
        decode_dict(archive, (b'name', b'hostname', b'username', b'time'))  # fixme: argv
        items_buffer = ChunkBuffer(self.key)
        items_buffer.write_chunk = add_callback
        for item in robust_iterator(archive):
            if b'chunks' in item:
                verify_file_chunks(item)
            items_buffer.add(item)
        items_buffer.flush(flush=True)
        for previous_item_id in archive[b'items']:
            mark_as_possibly_superseded(previous_item_id)
        archive[b'items'] = items_buffer.chunks
        data = msgpack.packb(archive, unicode_errors='surrogateescape')
        new_archive_id = self.key.id_hash(data)
        cdata = self.key.encrypt(data)
        add_reference(new_archive_id, len(data), len(cdata), cdata)
        info[b'id'] = new_archive_id
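When `verify_file_chunks` finds a chunk id that is no longer in the repository, it substitutes an all-zero chunk of the same length so the file keeps its original size and offsets. A rough sketch of that replacement step, using hashlib.sha256 as a stand-in for `self.key.id_hash` and leaving out encryption:

import hashlib

def make_replacement_chunk(size):
    """Build an all-zero chunk of the given size, as verify_file_chunks does
    for missing chunks (sha256 is only a stand-in for the keyed id hash)."""
    data = bytes(size)                       # `size` zero bytes
    chunk_id = hashlib.sha256(data).digest()
    return chunk_id, data

chunk_id, data = make_replacement_chunk(4096)
assert len(data) == 4096 and set(data) == {0}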