def run_replication_task_part(replication_task: ReplicationTask, source_dataset: str,
                              src_context: ReplicationContext, dst_context: ReplicationContext, observer):
    """Replicate one source dataset of `replication_task`, reporting progress to `observer`."""
    target_dataset = get_target_dataset(replication_task, source_dataset)

    check_target_type(replication_task, source_dataset, src_context, dst_context)

    step_templates = calculate_replication_step_templates(replication_task, source_dataset,
                                                          src_context, dst_context)

    destroy_empty_encrypted_target(replication_task, source_dataset, dst_context)

    def report_dataset_sizes(src_used, dst_used):
        # Forward periodic source/destination size measurements to the observer.
        notify(observer, ReplicationTaskDataProgress(replication_task.id, source_dataset,
                                                     src_used, dst_used))

    with DatasetSizeObserver(src_context.shell, dst_context.shell,
                             source_dataset, target_dataset, report_dataset_sizes):
        if resume_replications(step_templates, observer):
            # A resumed send changed on-disk state, so the step plan must be recomputed.
            step_templates = calculate_replication_step_templates(replication_task, source_dataset,
                                                                  src_context, dst_context)

        run_replication_steps(step_templates, observer)

    mount_dst_datasets(dst_context, target_dataset, replication_task.recursive)
def run_replication_step(step: ReplicationStep, observer=None):
    """Run one replication send/receive step and notify `observer` on success."""
    task = step.replication_task

    logger.info("For replication task %r: doing %s from %r to %r of snapshot=%r incremental_base=%r "
                "receive_resume_token=%r",
                task.id, task.direction.value, step.src_dataset, step.dst_dataset,
                step.snapshot, step.incremental_base, step.receive_resume_token)

    # The transport always lives on the remote side of the connection.
    if task.direction == ReplicationDirection.PUSH:
        local_context, remote_context = step.src_context, step.dst_context
    elif task.direction == ReplicationDirection.PULL:
        local_context, remote_context = step.dst_context, step.src_context
    else:
        raise ValueError(f"Invalid replication direction: {task.direction!r}")

    transport = remote_context.transport
    process = transport.replication_process(
        task.id,
        transport,
        local_context.shell,
        remote_context.shell,
        task.direction,
        step.src_dataset,
        step.dst_dataset,
        step.snapshot,
        task.properties,
        task.replicate,
        step.incremental_base,
        step.receive_resume_token,
        task.compression,
        task.speed_limit,
        task.dedup,
        task.large_block,
        task.embed,
        task.compressed)

    def report_progress(snapshot, current, total):
        # `snapshot` is a full "dataset@name" identifier; the observer message
        # takes dataset and snapshot name separately.
        notify(observer, ReplicationTaskSnapshotProgress(task.id, snapshot.split("@")[0],
                                                         snapshot.split("@")[1], current, total))

    process.add_progress_observer(report_progress)

    monitor = ReplicationMonitor(step.dst_context.shell, step.dst_dataset)
    ReplicationProcessRunner(process, monitor).run()

    notify(observer, ReplicationTaskSnapshotSuccess(task.id, step.src_dataset, step.snapshot))
def _spawn_replication_tasks(self, replication_tasks):
    """Start each replication task, queueing those that cannot run in parallel.

    Tasks already queued are skipped; tasks that conflict with running ones are
    appended to `self.pending_tasks` and the observer is told they were scheduled.
    If nothing is pending or running afterwards, retention is kicked off.
    """
    with self.tasks_lock:
        for replication_task in replication_tasks:
            if replication_task in self.pending_tasks:
                # BUG FIX: the "%r" placeholder previously had no corresponding
                # argument, so the log call produced a formatting error instead
                # of a useful message.
                logger.debug("Replication task %r is already pending", replication_task)
                continue

            if self._can_spawn_replication_task(replication_task):
                self._spawn_replication_task(replication_task)
            else:
                logger.info(
                    "Replication task %r can't execute in parallel with already running tasks, "
                    "delaying it", replication_task)
                notify(self.observer, ReplicationTaskScheduled(replication_task.id))
                self.pending_tasks.append(replication_task)

        # Nothing pending or running means replication is idle — safe to run retention.
        if not self.pending_tasks and not self.running_tasks and not self.retention_running:
            self._spawn_retention()
def _run_periodic_snapshot_tasks(self, now, tasks):
    """Create snapshots for the given periodic tasks at `now`, then prune empty ones."""
    def creation_order(pair):
        task, name = pair
        return (
            # Lexicographically less snapshots should go first
            name,
            # Recursive snapshot with same name as non-recursive should go first
            0 if task.recursive else 1,
            # Recursive snapshots without exclude should go first
            0 if not task.exclude else 1,
        )

    tasks_with_snapshot_names = sorted(
        ((task, now.strftime(task.naming_schema)) for task in tasks),
        key=creation_order)

    created = set()
    for task, name in tasks_with_snapshot_names:
        snapshot = Snapshot(task.dataset, name)
        if snapshot in created:
            # A previous (e.g. recursive) task already produced this snapshot.
            continue

        options = notify(self.observer, PeriodicSnapshotTaskStart(task.id))
        try:
            create_snapshot(self.local_shell, snapshot, task.recursive, task.exclude, options.properties)
        except CreateSnapshotError as e:
            logger.warning("Error creating %r: %r", snapshot, e)
            notify(self.observer, PeriodicSnapshotTaskError(task.id, str(e)))
        else:
            logger.info("Created %r", snapshot)
            created.add(snapshot)
            notify(self.observer, PeriodicSnapshotTaskSuccess(task.id))

    empty_snapshots = get_empty_snapshots_for_deletion(self.local_shell, tasks_with_snapshot_names)
    if empty_snapshots:
        logger.info("Destroying empty snapshots: %r", empty_snapshots)
        destroy_snapshots(self.local_shell, empty_snapshots)
def _run_periodic_snapshot_tasks(self, now, tasks):
    """Create snapshots for the given periodic tasks at time `now`, then prune empty ones.

    Tasks whose computed snapshot name cannot be re-parsed with their own naming
    schema are reported as errors and skipped.
    """
    scheduled_tasks = []
    for task in tasks:
        snapshot_name = get_snapshot_name(now, task.naming_schema)

        try:
            # Round-trip the generated name through the parser so that later
            # sorting can rely on the parsed representation.
            parsed_snapshot_name = parse_snapshot_name(snapshot_name, task.naming_schema)
        except ValueError as e:
            logger.warning(
                "Unable to parse snapshot name %r with naming schema %r: %s. Skipping task %r",
                snapshot_name,
                task.naming_schema,
                str(e),
                task,
            )
            notify(self.observer, PeriodicSnapshotTaskError(task.id, "Unable to parse snapshot name %r: %s" % (
                snapshot_name,
                str(e),
            )))
            continue

        scheduled_tasks.append(ScheduledPeriodicSnapshotTask(
            task,
            snapshot_name,
            parsed_snapshot_name,
        ))

    scheduled_tasks = sorted(scheduled_tasks, key=lambda scheduled_task: (
        # Common sorting order
        parsed_snapshot_sort_key(scheduled_task.parsed_snapshot_name),
        # Recursive snapshot with same name as non-recursive should go first
        0 if scheduled_task.task.recursive else 1,
        # Recursive snapshots without exclude should go first
        0 if not scheduled_task.task.exclude else 1,
    ))

    tasks_with_snapshot_names = [(scheduled_task.task, scheduled_task.snapshot_name)
                                 for scheduled_task in scheduled_tasks]

    created_snapshots = set()
    for task, snapshot_name in tasks_with_snapshot_names:
        snapshot = Snapshot(task.dataset, snapshot_name)
        if snapshot in created_snapshots:
            # Snapshot already created by an earlier (e.g. recursive) task in
            # this run; still report success for this task.
            notify(self.observer, PeriodicSnapshotTaskSuccess(task.id))
            continue

        # Observer may supply extra snapshot properties via the start message.
        options = notify(self.observer, PeriodicSnapshotTaskStart(task.id))
        try:
            create_snapshot(self.local_shell, snapshot, task.recursive, task.exclude, options.properties)
        except CreateSnapshotError as e:
            logger.warning("Error creating %r: %r", snapshot, e)
            notify(self.observer, PeriodicSnapshotTaskError(task.id, str(e)))
        else:
            logger.info("Created %r", snapshot)
            created_snapshots.add(snapshot)
            notify(self.observer, PeriodicSnapshotTaskSuccess(task.id))

    empty_snapshots = get_empty_snapshots_for_deletion(self.local_shell, tasks_with_snapshot_names)
    if empty_snapshots:
        logger.info("Destroying empty snapshots: %r",
                    empty_snapshots)
        destroy_snapshots(self.local_shell, empty_snapshots)
def run_replication_tasks(local_shell: LocalShell, transport: Transport, remote_shell: Shell,
                          replication_tasks: [ReplicationTask], observer=None):
    """Run all parts of the given replication tasks, retrying recoverable errors.

    Each task is split into (task, source_dataset) parts; a task is reported
    successful only after its last part completes. Network-related errors are
    retried up to `task.retries` times with exponential backoff (capped at 60s);
    other errors fail the task immediately.
    """
    replication_tasks_parts = calculate_replication_tasks_parts(replication_tasks)

    started_replication_tasks_ids = set()
    failed_replication_tasks_ids = set()
    # How many parts each task still has to complete; success is reported when
    # this reaches zero for a task.
    replication_tasks_parts_left = {
        replication_task.id: len([1 for another_replication_task, source_dataset in replication_tasks_parts
                                  if another_replication_task == replication_task])
        for replication_task in replication_tasks
    }

    for replication_task, source_dataset in replication_tasks_parts:
        # Once a task failed, skip its remaining parts.
        if replication_task.id in failed_replication_tasks_ids:
            continue

        local_context = ReplicationContext(None, local_shell)
        remote_context = ReplicationContext(transport, remote_shell)

        if replication_task.direction == ReplicationDirection.PUSH:
            src_context = local_context
            dst_context = remote_context
        elif replication_task.direction == ReplicationDirection.PULL:
            src_context = remote_context
            dst_context = local_context
        else:
            raise ValueError(f"Invalid replication direction: {replication_task.direction!r}")

        # Emit ReplicationTaskStart only once per task, on its first part.
        if replication_task.id not in started_replication_tasks_ids:
            notify(observer, ReplicationTaskStart(replication_task.id))
            started_replication_tasks_ids.add(replication_task.id)

        recoverable_error = None
        recoverable_sleep = 1
        for i in range(replication_task.retries):
            if recoverable_error is not None:
                # Exponential backoff between retries, capped at 60 seconds.
                logger.info("After recoverable error sleeping for %d seconds", recoverable_sleep)
                time.sleep(recoverable_sleep)
                recoverable_sleep = min(recoverable_sleep * 2, 60)
            else:
                recoverable_sleep = 1

            try:
                try:
                    run_replication_task_part(replication_task, source_dataset, src_context, dst_context,
                                              observer)
                except socket.timeout:
                    raise RecoverableReplicationError("Network connection timeout") from None
                except paramiko.ssh_exception.NoValidConnectionsError as e:
                    raise RecoverableReplicationError(str(e).replace("[Errno None] ", "")) from None
                except (IOError, OSError) as e:
                    raise RecoverableReplicationError(str(e)) from None

                replication_tasks_parts_left[replication_task.id] -= 1
                if replication_tasks_parts_left[replication_task.id] == 0:
                    notify(observer, ReplicationTaskSuccess(replication_task.id))
                break
            except RecoverableReplicationError as e:
                logger.warning("For task %r at attempt %d recoverable replication error %r",
                               replication_task.id, i + 1, e)
                recoverable_error = e
            except ReplicationError as e:
                logger.error("For task %r non-recoverable replication error %r", replication_task.id, e)
                notify(observer, ReplicationTaskError(replication_task.id, str(e)))
                failed_replication_tasks_ids.add(replication_task.id)
                break
            except Exception as e:
                logger.error("For task %r unhandled replication error %r", replication_task.id, e,
                             exc_info=True)
                notify(observer, ReplicationTaskError(replication_task.id, str(e)))
                failed_replication_tasks_ids.add(replication_task.id)
                break
        else:
            # Loop exhausted without `break`: every retry hit a recoverable error.
            logger.error("Failed replication task %r after %d retries", replication_task.id,
                         replication_task.retries)
            notify(observer, ReplicationTaskError(replication_task.id, str(recoverable_error)))
            failed_replication_tasks_ids.add(replication_task.id)
def run_replication_step(step: ReplicationStep, observer=None, observer_snapshot=None):
    """Run one replication send/receive step, emitting start/progress/success messages.

    `observer_snapshot` overrides the snapshot name used in observer messages
    (defaults to the step's snapshot).
    """
    logger.info(
        "For replication task %r: doing %s from %r to %r of snapshot=%r incremental_base=%r receive_resume_token=%r "
        "encryption=%r",
        step.replication_task.id,
        step.replication_task.direction.value,
        step.src_dataset,
        step.dst_dataset,
        step.snapshot,
        step.incremental_base,
        step.receive_resume_token,
        step.encryption is not None,
    )

    observer_snapshot = observer_snapshot or step.snapshot

    notify(observer, ReplicationTaskSnapshotStart(
        step.replication_task.id, step.src_dataset, observer_snapshot,
        step.src_context.context.snapshots_sent, step.src_context.context.snapshots_total,
    ))

    # Umount target dataset because we will be overwriting its contents and children mountpoints
    # will become dangling. ZFS will mount entire directory structure again after receiving.
    try:
        step.dst_context.shell.exec(["zfs", "umount", step.dst_dataset])
    except ExecException:
        # Best-effort: the dataset may simply not be mounted.
        pass

    # The transport always lives on the remote side of the connection.
    if step.replication_task.direction == ReplicationDirection.PUSH:
        local_context = step.src_context
        remote_context = step.dst_context
    elif step.replication_task.direction == ReplicationDirection.PULL:
        local_context = step.dst_context
        remote_context = step.src_context
    else:
        raise ValueError(f"Invalid replication direction: {step.replication_task.direction!r}")

    transport = remote_context.transport
    # NOTE: positional argument order here must match the transport's
    # replication_process signature exactly.
    process = transport.replication_process(
        step.replication_task.id,
        transport,
        local_context.shell,
        remote_context.shell,
        step.replication_task.direction,
        step.src_dataset,
        step.dst_dataset,
        step.snapshot,
        step.replication_task.properties,
        step.replication_task.properties_exclude,
        step.replication_task.properties_override,
        step.replication_task.replicate,
        step.encryption,
        step.incremental_base,
        step.receive_resume_token,
        step.replication_task.compression,
        step.replication_task.speed_limit,
        step.replication_task.dedup,
        step.replication_task.large_block,
        step.replication_task.embed,
        step.replication_task.compressed,
        # Whether source-side encryption status is relevant for property replication
        # — presumably a "raw send" flag; confirm against the transport implementation.
        step.replication_task.properties and step.src_context.datasets_encrypted[step.src_dataset],
    )
    process.add_progress_observer(
        lambda bytes_sent, bytes_total: notify(observer, ReplicationTaskSnapshotProgress(
            step.replication_task.id, step.src_dataset, observer_snapshot,
            step.src_context.context.snapshots_sent, step.src_context.context.snapshots_total,
            bytes_sent, bytes_total,
        ))
    )
    monitor = ReplicationMonitor(step.dst_context.shell, step.dst_dataset)
    ReplicationProcessRunner(process, monitor).run()

    # Record completion of this step against its template before reporting success.
    step.template.src_context.context.snapshots_sent_by_replication_step_template[step.template] += 1

    notify(observer, ReplicationTaskSnapshotSuccess(
        step.replication_task.id, step.src_dataset, observer_snapshot,
        step.src_context.context.snapshots_sent, step.src_context.context.snapshots_total,
    ))

    if step.incremental_base is None:
        # Might have created dataset, need to set it to readonly
        handle_readonly(step.template)
def run_replication_tasks(local_shell: LocalShell, transport: Transport, remote_shell: Shell,
                          replication_tasks: [ReplicationTask], observer=None):
    """Run all parts of the given replication tasks, retrying recoverable errors.

    Each task gets a shared GlobalReplicationContext that accumulates warnings
    across its parts. Network/SSH transport errors are retried up to
    `task.retries` times with exponential backoff (capped at 60s); configuration
    and unhandled errors fail the task immediately.
    """
    contexts = defaultdict(GlobalReplicationContext)

    replication_tasks_parts = calculate_replication_tasks_parts(replication_tasks)

    started_replication_tasks_ids = set()
    failed_replication_tasks_ids = set()
    # How many parts each task still has to complete; success is reported when
    # this reaches zero for a task.
    replication_tasks_parts_left = {
        replication_task.id: len([
            1
            for another_replication_task, source_dataset in replication_tasks_parts
            if another_replication_task == replication_task
        ])
        for replication_task in replication_tasks
    }

    for replication_task, source_dataset in replication_tasks_parts:
        # Once a task failed, skip its remaining parts.
        if replication_task.id in failed_replication_tasks_ids:
            continue

        local_context = ReplicationContext(contexts[replication_task], None, local_shell)
        remote_context = ReplicationContext(contexts[replication_task], transport, remote_shell)

        if replication_task.direction == ReplicationDirection.PUSH:
            src_context = local_context
            dst_context = remote_context
        elif replication_task.direction == ReplicationDirection.PULL:
            src_context = remote_context
            dst_context = local_context
        else:
            raise ValueError(
                f"Invalid replication direction: {replication_task.direction!r}"
            )

        # Emit ReplicationTaskStart only once per task, on its first part.
        if replication_task.id not in started_replication_tasks_ids:
            notify(observer, ReplicationTaskStart(replication_task.id))
            started_replication_tasks_ids.add(replication_task.id)

        recoverable_error = None
        recoverable_sleep = 1
        for i in range(replication_task.retries):
            if recoverable_error is not None:
                # Exponential backoff between retries, capped at 60 seconds.
                logger.info("After recoverable error sleeping for %d seconds",
                            recoverable_sleep)
                time.sleep(recoverable_sleep)
                recoverable_sleep = min(recoverable_sleep * 2, 60)
            else:
                recoverable_sleep = 1

            try:
                try:
                    run_replication_task_part(replication_task, source_dataset,
                                              src_context, dst_context, observer)
                except socket.timeout:
                    raise RecoverableReplicationError(
                        "Network connection timeout") from None
                except paramiko.ssh_exception.NoValidConnectionsError as e:
                    raise RecoverableReplicationError(
                        str(e).replace("[Errno None] ", "")) from None
                except paramiko.ssh_exception.SSHException as e:
                    # Configuration/authentication problems will not fix
                    # themselves on retry — treat them as non-recoverable.
                    if isinstance(
                            e,
                            (paramiko.ssh_exception.AuthenticationException,
                             paramiko.ssh_exception.BadHostKeyException,
                             paramiko.ssh_exception.ProxyCommandFailure,
                             paramiko.ssh_exception.ConfigParseError)):
                        raise ReplicationError(
                            str(e).replace("[Errno None] ", "")) from None
                    else:
                        # It might be an SSH error that leaves paramiko connection in an invalid state
                        # Let's reset remote shell just in case
                        remote_shell.close()
                        raise RecoverableReplicationError(
                            str(e).replace("[Errno None] ", "")) from None
                except ExecException as e:
                    # Exit status 128+SIGPIPE: zfs send's pipe broke; salvage
                    # any warnings zfs already printed, then retry.
                    if e.returncode == 128 + signal.SIGPIPE:
                        for warning in warnings_from_zfs_success(e.stdout):
                            contexts[replication_task].add_warning(warning)

                        raise RecoverableReplicationError(
                            broken_pipe_error(e.stdout))
                    else:
                        raise
                except (IOError, OSError) as e:
                    raise RecoverableReplicationError(str(e)) from None

                replication_tasks_parts_left[replication_task.id] -= 1
                if replication_tasks_parts_left[replication_task.id] == 0:
                    notify(
                        observer,
                        ReplicationTaskSuccess(
                            replication_task.id,
                            contexts[replication_task].warnings))
                break
            except RecoverableReplicationError as e:
                logger.warning(
                    "For task %r at attempt %d recoverable replication error %r",
                    replication_task.id, i + 1, e)
                recoverable_error = e
            except ReplicationError as e:
                logger.error(
                    "For task %r non-recoverable replication error %r",
                    replication_task.id, e)
                notify(observer,
                       ReplicationTaskError(replication_task.id, str(e)))
                failed_replication_tasks_ids.add(replication_task.id)
                break
            except Exception as e:
                logger.error("For task %r unhandled replication error %r",
                             replication_task.id, e, exc_info=True)
                notify(observer,
                       ReplicationTaskError(replication_task.id, str(e)))
                failed_replication_tasks_ids.add(replication_task.id)
                break
        else:
            # Loop exhausted without `break`: every retry hit a recoverable error.
            logger.error("Failed replication task %r after %d retries",
                         replication_task.id, replication_task.retries)
            notify(
                observer,
                ReplicationTaskError(replication_task.id,
                                     str(recoverable_error)))
            failed_replication_tasks_ids.add(replication_task.id)
def run_replication_step(step: ReplicationStep, observer=None, observer_snapshot=None):
    """Run one replication send/receive step, emitting start/progress/success messages.

    `observer_snapshot` overrides the snapshot name used in observer messages
    (defaults to the step's snapshot).
    """
    task = step.replication_task
    ctx = step.src_context.context

    logger.info(
        "For replication task %r: doing %s from %r to %r of snapshot=%r incremental_base=%r "
        "receive_resume_token=%r", task.id, task.direction.value,
        step.src_dataset, step.dst_dataset, step.snapshot,
        step.incremental_base, step.receive_resume_token)

    observer_snapshot = observer_snapshot or step.snapshot

    notify(observer, ReplicationTaskSnapshotStart(
        task.id,
        step.src_dataset,
        observer_snapshot,
        ctx.snapshots_sent,
        ctx.snapshots_total,
    ))

    # The transport always lives on the remote side of the connection.
    if task.direction == ReplicationDirection.PUSH:
        local_context, remote_context = step.src_context, step.dst_context
    elif task.direction == ReplicationDirection.PULL:
        local_context, remote_context = step.dst_context, step.src_context
    else:
        raise ValueError(f"Invalid replication direction: {task.direction!r}")

    transport = remote_context.transport
    # NOTE: positional argument order must match the transport's
    # replication_process signature exactly.
    process = transport.replication_process(
        task.id,
        transport,
        local_context.shell,
        remote_context.shell,
        task.direction,
        step.src_dataset,
        step.dst_dataset,
        step.snapshot,
        task.properties,
        task.replicate,
        step.incremental_base,
        step.receive_resume_token,
        task.compression,
        task.speed_limit,
        task.dedup,
        task.large_block,
        task.embed,
        task.compressed)

    def report_progress(bytes_sent, bytes_total):
        notify(observer, ReplicationTaskSnapshotProgress(
            task.id,
            step.src_dataset,
            observer_snapshot,
            ctx.snapshots_sent,
            ctx.snapshots_total,
            bytes_sent,
            bytes_total,
        ))

    process.add_progress_observer(report_progress)

    monitor = ReplicationMonitor(step.dst_context.shell, step.dst_dataset)
    ReplicationProcessRunner(process, monitor).run()

    # Record completion of this step against its template before reporting success.
    step.template.src_context.context.snapshots_sent_by_replication_step_template[step.template] += 1

    notify(observer, ReplicationTaskSnapshotSuccess(
        task.id,
        step.src_dataset,
        observer_snapshot,
        ctx.snapshots_sent,
        ctx.snapshots_total,
    ))

    if step.incremental_base is None:
        # Might have created dataset, need to set it to readonly
        handle_readonly(step.template)