def destroy_snapshots(shell: Shell, snapshots: [Snapshot]):
    """Destroy `snapshots` on `shell`, batching per dataset.

    Snapshots are grouped by their dataset and destroyed with
    comma-separated `zfs destroy` invocations whose total argument length
    stays below ARG_MAX. Snapshots that are busy or have dependent clones
    are logged and skipped so the remaining ones can still be destroyed.
    """
    for dataset, dataset_snapshots in sortedgroupby(snapshots, lambda snapshot: snapshot.dataset):
        remaining = {snapshot.name for snapshot in dataset_snapshots}
        logger.info("On %r for dataset %r destroying snapshots %r", shell, dataset, remaining)

        while remaining:
            # Greedily pack snapshot names into a single command line,
            # keeping the argument length below ARG_MAX.
            batch = set()
            arg_len = len(dataset)
            for candidate in sorted(remaining):
                candidate_arg_len = arg_len + len(candidate) + 1  # +1 for the "@"/"," separator
                if candidate_arg_len >= ARG_MAX:
                    break
                batch.add(candidate)
                arg_len = candidate_arg_len

            args = ["zfs", "destroy", f"{dataset}@" + ",".join(sorted(batch))]
            try:
                shell.exec(args)
            except ExecException as e:
                # One offending snapshot aborts the whole batch; identify it,
                # drop it, and retry with the rest.
                if m := re.search(r"cannot destroy snapshot .+?@(.+?): dataset is busy", e.stdout):
                    reason = "busy"
                    name = m.group(1)
                elif m := re.search(r"cannot destroy '.+?@(.+?)': snapshot has dependent clones", e.stdout):
                    reason = "cloned"
                    name = m.group(1)
                else:
                    raise

                logger.info("Snapshot %r on dataset %r is %s, skipping", name, dataset, reason)
                remaining.discard(name)
            else:
                remaining -= batch
def destroy_snapshots(shell: Shell, snapshots: [Snapshot]):
    """Destroy `snapshots` on `shell`, one `zfs destroy` call per dataset.

    Snapshots are grouped by dataset and destroyed with a single
    comma-separated `zfs destroy dataset@a,b,c` invocation per group.
    """
    for dataset, snapshots in sortedgroupby(snapshots, lambda snapshot: snapshot.dataset):
        names = [snapshot.name for snapshot in snapshots]
        logger.info("On %r for dataset %r destroying snapshots %r", shell, dataset, names)

        # Bug fix: "%" is ZFS *range* syntax (dataset@a%z destroys every
        # snapshot between a and z), so joining an explicit list with "%"
        # could destroy snapshots that were never requested. An explicit
        # list of snapshots must be comma-separated.
        args = ["zfs", "destroy", f"{dataset}@" + ",".join(names)]
        shell.exec(args)
def list_datasets(shell: Shell, dataset: str = None, recursive: bool = True):
    """Return the names of ZFS filesystems visible on `shell`, sorted by name.

    When `dataset` is given, the listing is limited to that dataset and
    either all of its descendants (`recursive=True`) or only its direct
    children (`recursive=False`).
    """
    args = ["zfs", "list", "-t", "filesystem", "-H", "-o", "name", "-s", "name"]
    args += ["-r"] if recursive else ["-d", "1"]
    if dataset is not None:
        args.append(dataset)

    output = shell.exec(args)
    return [line for line in output.split("\n") if line]
def ensure_has_no_data(shell: Shell, dataset: str):
    """Raise `ReplicationError` if `dataset` on `shell` already contains data.

    A nonexistent dataset trivially has no data. For a mounted filesystem
    the mountpoint contents are inspected directly (ignoring a visible
    `.zfs` directory); otherwise the `used` (filesystems) or `referenced`
    (volumes) property is compared against a 10 MiB threshold.

    :raises ReplicationError: if the dataset contains data or has an
        unexpected `type` property.
    """
    try:
        dst_properties = get_properties(shell, dataset, {
            "type": str,
            "mounted": bool,
            "mountpoint": str,
            "referenced": int,
            "snapdir": str,
            "used": int,
        })
    except ExecException as e:
        # A missing dataset has no data by definition.
        if "dataset does not exist" not in e.stdout:
            raise
    else:
        if (dst_properties["type"] == "filesystem" and
                dst_properties["mounted"] and
                dst_properties["mountpoint"] != "legacy"):
            try:
                index = shell.ls(dst_properties["mountpoint"])
            except Exception as e:
                # Fall through to the size-based check below.
                logger.warning(
                    "An exception occurred while listing dataset %r mountpoint %r: %r. Assuming dataset is not mounted",
                    dataset, dst_properties["mountpoint"], e,
                )
            else:
                if dst_properties["snapdir"] == "visible" and ".zfs" in index:
                    index.remove(".zfs")

                if index:
                    # Fixed previously unbalanced parenthesis in this message.
                    raise ReplicationError(
                        f"Target dataset {dataset!r} does not have snapshots but has data (e.g. {index[0]!r}) and "
                        f"replication from scratch is not allowed. Refusing to overwrite existing data."
                    )

                return

        if dst_properties["type"] == "filesystem":
            used_property = "used"
        elif dst_properties["type"] == "volume":
            used_property = "referenced"
        else:
            raise ReplicationError(
                f"Target dataset {dataset!r} has invalid type {dst_properties['type']!r}"
            )

        # Empty datasets on large pool configurations can have really big size
        if dst_properties[used_property] > 1024 * 1024 * 10:
            raise ReplicationError(
                f"Target dataset {dataset!r} does not have snapshots but has data ({dst_properties[used_property]} "
                f"bytes used) and replication from scratch is not allowed. Refusing to overwrite existing data."
            )
def list_snapshots(shell: Shell, dataset: str, recursive: bool) -> [Snapshot]:
    """Return `Snapshot` objects for `dataset` on `shell`, sorted by name.

    `recursive=True` includes all descendant datasets; otherwise only
    snapshots of `dataset` itself are listed.
    """
    args = ["zfs", "list", "-t", "snapshot", "-H", "-o", "name", "-s", "name"]
    if recursive:
        args.append("-r")
    else:
        args += ["-d", "1"]
    args.append(dataset)

    # Each non-empty output line is "<dataset>@<snapshot name>".
    return [
        Snapshot(*line.split("@"))
        for line in shell.exec(args).split("\n")
        if line
    ]
def destroy_snapshots(shell: Shell, snapshots: [Snapshot]):
    """Destroy `snapshots` on `shell`, skipping snapshots that are busy.

    Snapshots are grouped by dataset and destroyed with one
    comma-separated `zfs destroy` call per dataset; if a snapshot turns
    out to be busy, it is dropped from the list and the call is retried.
    """
    for dataset, dataset_snapshots in sortedgroupby(snapshots, lambda snapshot: snapshot.dataset):
        names = [snapshot.name for snapshot in dataset_snapshots]
        logger.info("On %r for dataset %r destroying snapshots %r", shell, dataset, names)

        while names:
            try:
                shell.exec(["zfs", "destroy", f"{dataset}@" + ",".join(names)])
            except ExecException as e:
                busy = re.search(r"cannot destroy snapshot .+?@(.+?): dataset is busy", e.stdout)
                if busy is None:
                    raise

                skipped = busy.group(1)
                logger.info("Snapshot %r on dataset %r is busy, skipping", skipped, dataset)
                names.remove(skipped)
            else:
                break
def list_datasets_with_properties(shell: Shell, dataset: str=None, recursive: bool=True, properties=None):
    """List filesystems and volumes on `shell` together with extra properties.

    `properties` is an optional list of ZFS property names; "name" is
    always included as the first column. Returns one dict per dataset
    mapping property name to its (string) value.
    """
    columns = ["name"] + (properties or [])

    args = ["zfs", "list", "-t", "filesystem,volume", "-H", "-o", ",".join(columns), "-s", "name"]
    if recursive:
        args.append("-r")
    else:
        args += ["-d", "1"]
    if dataset is not None:
        args.append(dataset)

    with ZfsCliExceptionHandler():
        output = shell.exec(args)

    # `-H` gives tab-separated columns in the same order as `-o`.
    return [
        dict(zip(columns, line.split("\t")))
        for line in output.split("\n")
        if line
    ]
def list_datasets_with_properties(shell: Shell, dataset: str=None, recursive: bool=True, properties=None):
    """List filesystems and volumes on `shell` together with parsed properties.

    `properties` is an optional mapping of ZFS property name to the type
    `parse_property` should coerce its value to; "name" (as `str`) is
    always included. Returns one dict per dataset mapping property name
    to its parsed value.
    """
    # Bug fix: the original did `properties["name"] = str`, mutating the
    # caller's dict as a side effect. Build a copy instead. Dict-literal
    # duplicate-key semantics keep an existing "name" key in its original
    # position (while forcing its type to str), so the `-o` column order —
    # and therefore the zip() below — is unchanged.
    properties = {**(properties or {}), "name": str}

    args = ["zfs", "list", "-t", "filesystem,volume", "-H", "-o", ",".join(properties.keys()), "-s", "name"]
    if recursive:
        args.extend(["-r"])
    else:
        args.extend(["-d", "1"])
    if dataset is not None:
        args.append(dataset)

    with ZfsCliExceptionHandler():
        output = shell.exec(args)

    # `-H` gives tab-separated columns in the same order as `-o`.
    return [
        {
            property: parse_property(value, properties[property])
            for property, value in zip(properties, line.split("\t"))
        }
        for line in filter(None, output.split("\n"))
    ]
def inspect_data(shell: Shell, dataset: str, exclude: [str]=None):
    """Inspect the contents of `dataset` on `shell`.

    Returns a `(index, dst_properties)` tuple:

    * `(None, None)` — the dataset does not exist;
    * `(index, dst_properties)` — the dataset is a mounted (non-legacy)
      filesystem whose mountpoint could be listed; `index` is the list of
      directory entries remaining after pruning `.zfs` and entries named
      in `exclude` that are (or look like) child-dataset mountpoints;
    * `(None, dst_properties)` — the dataset exists but its contents could
      not be inspected (not a mounted filesystem, legacy mountpoint, or
      listing failed).

    :param exclude: names of child datasets whose mountpoint directories
        should not count as data of `dataset` itself.
    """
    exclude = exclude or []

    try:
        dst_properties = get_properties(shell, dataset, {
            "type": str,
            "mounted": bool,
            "mountpoint": str,
            "referenced": int,
            "snapdir": str,
            "used": int,
        })
    except ExecException as e:
        # A missing dataset is reported distinctly from an uninspectable one.
        if "dataset does not exist" in e.stdout:
            return None, None
        raise
    else:
        if (
            dst_properties["type"] == "filesystem" and
            dst_properties["mounted"] and
            dst_properties["mountpoint"] != "legacy"
        ):
            try:
                index = shell.ls(dst_properties["mountpoint"])
            except Exception as e:
                # Best-effort: fall through and return (None, dst_properties).
                logger.warning(
                    "An exception occurred while listing dataset %r mountpoint %r on shell %r: %r. "
                    "Assuming dataset is not mounted",
                    dataset, dst_properties["mountpoint"], shell, e,
                )
            else:
                # A visible snapshot directory is not user data.
                if dst_properties["snapdir"] == "visible" and ".zfs" in index:
                    index.remove(".zfs")

                # Prune excluded entries that correspond to child datasets
                # rather than to real data of this dataset.
                for excluded in exclude:
                    if excluded not in index:
                        continue

                    child_mountpoint = os.path.join(dst_properties["mountpoint"], excluded)
                    try:
                        # Only directories can be child-dataset mountpoints.
                        if not shell.is_dir(child_mountpoint):
                            continue
                    except Exception as e:
                        logger.warning(
                            "An exception occurred while checking if %r on shell %r is a directory: %r. "
                            "Assuming it is not",
                            child_mountpoint, shell, e,
                        )
                        continue

                    child_dataset = os.path.join(dataset, excluded)
                    try:
                        child_properties = get_properties(shell, child_dataset, {
                            "type": str,
                            "mounted": bool,
                            "mountpoint": str,
                        })
                    except Exception as e:
                        # NOTE(review): any failure (not just "does not exist")
                        # keeps the entry in `index` — confirm this is intended.
                        logger.warning(
                            "An exception occurred while getting properties for dataset %r on shell %r: %r. "
                            "Assuming it does not exist",
                            child_dataset, shell, e,
                        )
                        continue

                    if child_properties["type"] == "filesystem":
                        if child_properties["mounted"] and child_properties["mountpoint"] == child_mountpoint:
                            # The directory is the child filesystem's mountpoint,
                            # not data belonging to `dataset`.
                            index.remove(excluded)
                        else:
                            # Child filesystem exists but is not mounted here:
                            # an empty directory is still not data.
                            try:
                                child_contents = shell.ls(child_mountpoint)
                            except Exception as e:
                                logger.warning(
                                    "An exception occurred while listing %r on shell %r: %r. "
                                    "Assuming it is not empty",
                                    child_mountpoint, shell, e,
                                )
                                continue
                            else:
                                if not child_contents:
                                    index.remove(excluded)

                return index, dst_properties

        return None, dst_properties
def run_replication_tasks(local_shell: LocalShell, transport: Transport, remote_shell: Shell,
                          replication_tasks: [ReplicationTask], observer=None):
    """Run `replication_tasks` part by part, with retries and observer notifications.

    Each task is split into parts (one per source dataset) by
    `calculate_replication_tasks_parts`. Every part is attempted up to
    `replication_task.retries` times with exponential backoff (capped at
    60 s) on recoverable errors. The observer is notified of task start,
    success (when the last part of a task finishes), and error; once a
    task fails, its remaining parts are skipped.
    """
    # One shared GlobalReplicationContext per task, created lazily.
    contexts = defaultdict(GlobalReplicationContext)

    replication_tasks_parts = calculate_replication_tasks_parts(replication_tasks)

    started_replication_tasks_ids = set()
    failed_replication_tasks_ids = set()
    # Count of not-yet-finished parts per task, used to detect completion.
    replication_tasks_parts_left = {
        replication_task.id: len([
            1
            for another_replication_task, source_dataset in replication_tasks_parts
            if another_replication_task == replication_task
        ])
        for replication_task in replication_tasks
    }

    for replication_task, source_dataset in replication_tasks_parts:
        # Skip remaining parts of a task that has already failed.
        if replication_task.id in failed_replication_tasks_ids:
            continue

        local_context = ReplicationContext(contexts[replication_task], None, local_shell)
        remote_context = ReplicationContext(contexts[replication_task], transport, remote_shell)

        # PUSH sends local -> remote, PULL receives remote -> local.
        if replication_task.direction == ReplicationDirection.PUSH:
            src_context = local_context
            dst_context = remote_context
        elif replication_task.direction == ReplicationDirection.PULL:
            src_context = remote_context
            dst_context = local_context
        else:
            raise ValueError(f"Invalid replication direction: {replication_task.direction!r}")

        # Notify start only once per task, on its first part.
        if replication_task.id not in started_replication_tasks_ids:
            notify(observer, ReplicationTaskStart(replication_task.id))
            started_replication_tasks_ids.add(replication_task.id)

        recoverable_error = None
        recoverable_sleep = 1
        for i in range(replication_task.retries):
            if recoverable_error is not None:
                # Exponential backoff between retries, capped at 60 seconds.
                logger.info("After recoverable error sleeping for %d seconds", recoverable_sleep)
                time.sleep(recoverable_sleep)
                recoverable_sleep = min(recoverable_sleep * 2, 60)
            else:
                recoverable_sleep = 1

            try:
                # Inner try: translate transport-level failures into
                # ReplicationError / RecoverableReplicationError.
                try:
                    run_replication_task_part(replication_task, source_dataset, src_context, dst_context, observer)
                except socket.timeout:
                    raise RecoverableReplicationError("Network connection timeout") from None
                except paramiko.ssh_exception.NoValidConnectionsError as e:
                    raise RecoverableReplicationError(str(e).replace("[Errno None] ", "")) from None
                except paramiko.ssh_exception.SSHException as e:
                    # Authentication/configuration problems will not be fixed
                    # by retrying; everything else is treated as recoverable.
                    if isinstance(e, (paramiko.ssh_exception.AuthenticationException,
                                      paramiko.ssh_exception.BadHostKeyException,
                                      paramiko.ssh_exception.ProxyCommandFailure,
                                      paramiko.ssh_exception.ConfigParseError)):
                        raise ReplicationError(str(e).replace("[Errno None] ", "")) from None
                    else:
                        # It might be an SSH error that leaves paramiko connection in an invalid state
                        # Let's reset remote shell just in case
                        remote_shell.close()
                        raise RecoverableReplicationError(str(e).replace("[Errno None] ", "")) from None
                except (IOError, OSError) as e:
                    raise RecoverableReplicationError(str(e)) from None

                replication_tasks_parts_left[replication_task.id] -= 1
                # The task succeeds when its last part completes.
                if replication_tasks_parts_left[replication_task.id] == 0:
                    notify(observer, ReplicationTaskSuccess(replication_task.id))
                break
            except RecoverableReplicationError as e:
                logger.warning("For task %r at attempt %d recoverable replication error %r",
                               replication_task.id, i + 1, e)
                recoverable_error = e
            except ReplicationError as e:
                logger.error("For task %r non-recoverable replication error %r", replication_task.id, e)
                notify(observer, ReplicationTaskError(replication_task.id, str(e)))
                failed_replication_tasks_ids.add(replication_task.id)
                break
            except Exception as e:
                logger.error("For task %r unhandled replication error %r", replication_task.id, e, exc_info=True)
                notify(observer, ReplicationTaskError(replication_task.id, str(e)))
                failed_replication_tasks_ids.add(replication_task.id)
                break
        else:
            # Retry loop exhausted without a `break`: report the last
            # recoverable error as the task's failure reason.
            logger.error("Failed replication task %r after %d retries", replication_task.id,
                         replication_task.retries)
            notify(observer, ReplicationTaskError(replication_task.id, str(recoverable_error)))
            failed_replication_tasks_ids.add(replication_task.id)
def create_dataset(shell: Shell, dataset: str):
    """Create the ZFS dataset `dataset` on `shell` via `zfs create`."""
    command = ["zfs", "create", dataset]
    shell.exec(command)
def is_empty_snapshot(shell: Shell, snapshot: Snapshot):
    """Return True if `snapshot` on `shell` has a `written` value of zero bytes."""
    written = shell.exec(["zfs", "get", "-H", "-o", "value", "written", str(snapshot)])
    return written.strip() == "0"