def cancel(self):
    """Kills the program and all its child processes.

    Removes the runtime dir.

    Raises an exception if the top level process is still running
    after :attr:`.CANCEL_TIMEOUT` seconds.

    :raises RuntimeError: If the program is still running.
    """
    parent_pid = self._pid
    node = self._node
    tree = ' '.join([str(pid)
                     for pid in ptree(pid=parent_pid, node=node)])

    def cancel_task():
        """Kills the process tree and fails if the parent is still
        running after a timeout."""
        node.run(
            "kill {tree}"
            "; kill -0 {parent_pid} && exit 1 || exit 0".format(
                tree=tree,
                parent_pid=parent_pid))

    log = get_logger(__name__)
    with remove_runtime_dir_on_exit(node=self._node,
                                    runtime_dir=self._runtime_dir):
        with stage_debug(log, "Killing the process tree for pid: %d",
                         self._pid):
            retry_with_config(fun=cancel_task,
                              name=Retry.CANCEL_DEPLOYMENT,
                              config=self._node.config)

def test_third_try_succeeds_one_retry_with_config():
    """The task would succeed on the third try, but only one retry is
    configured, so only two attempts are made and the call fails."""
    failures = []
    config = ClusterConfigImpl(
        host='h',
        port=1,
        user='******',
        auth=AuthMethod.ASK,
        retries={Retry.PORT_INFO: RetryConfigImpl(count=1,
                                                  seconds_between=0)})
    with pytest.raises(RuntimeError):
        retry_with_config(
            fun=lambda: failing_task(fail_times=2, failures=failures),
            name=Retry.PORT_INFO,
            config=config)
    assert failures == [0, 1]

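# For context, a minimal sketch of the `failing_task` helper the test above
# relies on. This is an assumption about a fixture defined elsewhere in the
# test suite, not the actual implementation: it raises until it has failed
# `fail_times` times, recording the index of each failed attempt.
def failing_task(fail_times: int, failures: list):
    """Raises RuntimeError until it has failed `fail_times` times."""
    if len(failures) < fail_times:
        failures.append(len(failures))
        raise RuntimeError(
            "Failing on attempt {}.".format(len(failures)))
    return "success"
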
def connect_to_each_node(nodes: Sequence[Node], config: ClusterConfig):
    """Connects to each node to make sure any connection issues come up
    before attempting to actually deploy anything.

    :param nodes: Nodes to deploy Dask on.
    :param config: Cluster config.
    """
    log = get_logger(__name__)
    node_count = len(nodes)
    for i, node in enumerate(nodes):
        with stage_info(log, "Connecting to %s:%d (%d/%d).",
                        node.host, node.port, i + 1, node_count):
            retry_with_config(node.connect,
                              name=Retry.DASK_NODE_CONNECT,
                              config=config)

def check_scheduler_reachable_from_nodes(nodes: Sequence[Node],
                                         scheduler: DaskSchedulerDeployment,
                                         config: ClusterConfig):
    """Checks whether connection to the scheduler is possible from each
    node, before deploying workers.

    :param nodes: Nodes to deploy Dask on.
    :param scheduler: Scheduler to connect to.
    :param config: Cluster config.
    """
    log = get_logger(__name__)
    node_count = len(nodes)
    for i, node in enumerate(nodes):
        with stage_info(log,
                        "Checking scheduler connectivity from %s (%d/%d).",
                        node.host, i + 1, node_count):
            retry_with_config(lambda n=node: check_scheduler_reachable(
                node=n, scheduler=scheduler),
                              name=Retry.SCHEDULER_CONNECT,
                              config=config)

def finalize_allocation(allocation_id: int,
                        hostnames: List[str],
                        nodes: List[NodeImpl],
                        parameters: AllocationParameters,
                        allocated_until: datetime.datetime,
                        config: ClusterConfig):
    """Fetches node ports and makes them allocated.

    :param allocation_id: Allocation id, e.g. Slurm job id.
    :param hostnames: List of hostnames.
    :param nodes: Nodes to update with information.
    :param parameters: Allocation parameters.
    :param allocated_until: Timestamp for job termination.
    :param config: Cluster config.
    """

    def try_to_determine_ports():
        return determine_ports_for_nodes(allocation_id=allocation_id,
                                         hostnames=hostnames,
                                         config=config,
                                         raise_on_missing=True)

    try:
        node_count = len(hostnames)
        multiplier = int(ceil(node_count / 10))
        ports = retry_with_config(try_to_determine_ports,
                                  name=Retry.PORT_INFO,
                                  config=config,
                                  multiplier=multiplier)
    except RuntimeError:
        ports = determine_ports_for_nodes(allocation_id=allocation_id,
                                          hostnames=hostnames,
                                          config=config,
                                          raise_on_missing=False)

    for host, port, node in zip(hostnames, ports, nodes):
        node.make_allocated(
            host=host,
            port=port,
            cores=parameters.cores,
            memory=parameters.memory_per_node,
            allocated_until=allocated_until)

def validate_tunnel_http_connection(tunnel: TunnelInternal):
    """Checks whether there is an HTTP server replying to a request
    through the tunnel.

    :param tunnel: Tunnel to validate.
    """

    def access_server():
        with requests.Session() as session:
            return session.get("http://127.0.0.1:{local_port}".format(
                local_port=tunnel.here))

    response = retry_with_config(access_server,
                                 name=Retry.VALIDATE_HTTP_TUNNEL,
                                 config=tunnel.config)
    if "text/html" not in response.headers['Content-type']:
        raise RuntimeError("Unable to obtain an HTML response.")

def deploy_scheduler_on_first_node(
        nodes: Sequence[Node]) -> DaskSchedulerDeployment:  # noqa
    """Deploys a scheduler on the first node in the node sequence.

    :param nodes: Nodes to deploy Dask on.
    """
    log = get_logger(__name__)
    assert isinstance(nodes[0], NodeInternal)
    first_node = nodes[0]  # type: NodeInternal
    with stage_info(log, "Deploying scheduler on the first node: %s.",
                    first_node.host):
        scheduler = retry_with_config(
            lambda: deploy_dask_scheduler(node=first_node),
            name=Retry.DEPLOY_DASK_SCHEDULER,
            config=first_node.config)
        return scheduler

def allocate_slurm_nodes(parameters: AllocationParameters,
                         config: ClusterConfig) -> Nodes:
    """Tries to allocate nodes using Slurm.

    :param parameters: Allocation parameters.
    :param config: Config for the cluster to allocate nodes on.
    """
    args = SbatchArguments(params=parameters)

    log = get_logger(__name__)
    with stage_debug(log, "Executing sbatch on access node."):
        access_node = get_access_node(config=config)
        job_id, entry_point_script_path = run_sbatch(args=args,
                                                     node=access_node)

    def run_squeue_task() -> SqueueResult:
        job_squeue = run_squeue(node=access_node)
        return job_squeue[job_id]

    try:
        with stage_debug(log, "Obtaining info about job %d using squeue.",
                         job_id):
            job = retry_with_config(run_squeue_task,
                                    name=Retry.SQUEUE_AFTER_SBATCH,
                                    config=config)
    except Exception as e:  # noqa, pylint: disable=broad-except
        run_scancel(job_id=job_id, node=access_node)
        raise RuntimeError("Unable to obtain job info"
                           " after allocation.") from e

    node_count = job.node_count
    nodes = [NodeImpl(config=config) for _ in range(node_count)]

    allocation = SlurmAllocation(
        job_id=job_id,
        access_node=access_node,
        nodes=nodes,
        entry_point_script_path=entry_point_script_path,
        parameters=parameters)

    return NodesImpl(nodes=nodes,
                     allocation=allocation)

def tunnel(self, there: int, here: Optional[int] = None) -> TunnelInternal:
    try:
        log = get_logger(__name__)
        with stage_debug(log, "Opening tunnel %s -> %d to %s",
                         here, there, self):
            self._ensure_allocated()

            here, there = validate_tunnel_ports(here=here,
                                                there=there)

            first_try = [True]

            def get_bindings_and_build_tunnel() -> TunnelInternal:
                bindings = get_bindings_with_single_gateway(
                    here=here if first_try[0] else ANY_TUNNEL_PORT,
                    node_host=self._host,
                    node_port=self._port,
                    there=there)
                first_try[0] = False
                return build_tunnel(config=self._config,
                                    bindings=bindings,
                                    ssh_password=env.password,
                                    ssh_pkey=env.key_filename)

            with authenticate(host=self._host,
                              port=self._port,
                              config=self._config):
                if here == ANY_TUNNEL_PORT:
                    return get_bindings_and_build_tunnel()
                return retry_with_config(
                    get_bindings_and_build_tunnel,
                    name=Retry.TUNNEL_TRY_AGAIN_WITH_ANY_PORT,
                    config=self._config)
    except RuntimeError as e:
        raise RuntimeError(
            "Unable to tunnel {there} on node '{host}'.".format(
                there=there,
                host=self._host)) from e

def deploy_worker_on_node(node: Node,
                          scheduler: DaskSchedulerDeployment,
                          worker_number: int,
                          worker_count: int) -> DaskWorkerDeployment:
    """Deploys a worker on the node. Retries on failure.

    :param node: Node to deploy a worker on.
    :param scheduler: Scheduler for the worker.
    :param worker_number: Worker number out of workers to deploy.
    :param worker_count: Count of workers being deployed.
    """
    log = get_logger(__name__)
    with stage_info(log, "Deploying worker %d/%d.",
                    worker_number, worker_count):
        assert isinstance(node, NodeInternal)
        node_impl = node  # type: NodeInternal
        worker = retry_with_config(
            lambda: deploy_dask_worker(node=node_impl,
                                       scheduler=scheduler),
            name=Retry.DEPLOY_DASK_WORKER,
            config=node_impl.config)
        return worker

def build_tunnel(config: ClusterConfig,
                 bindings: List[Binding],
                 ssh_password: Optional[str] = None,
                 ssh_pkey: Optional[str] = None) -> TunnelInternal:
    """Builds a multi-hop tunnel from a sequence of bindings.

    :param config: Cluster config.
    :param bindings: Sequence of bindings, starting with the local binding.
    :param ssh_password: Ssh password.
    :param ssh_pkey: Ssh private key.
    """
    if len(bindings) < 2:
        raise ValueError("At least one local and one remote binding"
                         " is required to build a tunnel")

    with ExitStack() as stack:
        tunnels = []

        log = get_logger(__name__)
        log.debug("Ssh username: %s", config.user)
        log.debug("Using password: %r", ssh_password is not None)
        log.debug("Using key file: %s", ssh_pkey)

        logger = get_debug_logger("{}/Tunnels".format(__name__))

        # First hop if not the only one
        if len(bindings) != 2:
            with stage_debug(log, "Adding first hop."):
                ssh_address_or_host = (config.host, config.port)
                local_bind_address = ANY_ADDRESS
                remote_bind_address = bindings[1].as_tuple()

                log.debug("Ssh address is %s", ssh_address_or_host)
                log.debug("Local bind address is %s", local_bind_address)
                log.debug("Remote bind address is %s", remote_bind_address)

                def create_first_tunnel():
                    return FirstHopTunnel(
                        forwarder=SSHTunnelForwarder(
                            ssh_address_or_host,
                            ssh_config_file=None,
                            ssh_username=config.user,
                            ssh_password=ssh_password,
                            ssh_pkey=ssh_pkey,
                            local_bind_address=local_bind_address,
                            remote_bind_address=remote_bind_address,
                            set_keepalive=TUNNEL_KEEPALIVE,
                            allow_agent=False,
                            logger=logger),
                        there=bindings[1].port,
                        config=config)

                tunnel = retry_with_config(create_first_tunnel,
                                           name=Retry.OPEN_TUNNEL,
                                           config=config)
                stack.enter_context(close_tunnel_on_failure(tunnel))
                tunnels.append(tunnel)

        # Middle hops if any
        prev_tunnel = tunnels[0] if tunnels else None
        for i, next_binding in enumerate(bindings[2:-1]):
            with stage_debug(log, "Adding middle hop %d.", i):
                # Connect through previous tunnel
                ssh_address_or_host = (
                    "127.0.0.1",
                    prev_tunnel.forwarder.local_bind_port)
                local_bind_address = ANY_ADDRESS
                remote_bind_address = next_binding.as_tuple()

                log.debug("Ssh address is %s", ssh_address_or_host)
                log.debug("Local bind address is %s", local_bind_address)
                log.debug("Remote bind address is %s", remote_bind_address)

                def create_middle_tunnel():
                    next_binding_port = next_binding.port  # noqa, pylint: disable=cell-var-from-loop, line-too-long
                    return FirstHopTunnel(
                        forwarder=SSHTunnelForwarder(
                            ssh_address_or_host,
                            ssh_config_file=None,
                            ssh_username=config.user,
                            ssh_password=ssh_password,
                            ssh_pkey=ssh_pkey,
                            local_bind_address=local_bind_address,
                            remote_bind_address=remote_bind_address,
                            set_keepalive=TUNNEL_KEEPALIVE,
                            allow_agent=False,
                            logger=logger),
                        there=next_binding_port,
                        config=config)

                next_tunnel = retry_with_config(create_middle_tunnel,
                                                name=Retry.OPEN_TUNNEL,
                                                config=config)
                stack.enter_context(close_tunnel_on_failure(next_tunnel))
                tunnels.append(next_tunnel)
                prev_tunnel = next_tunnel

        with stage_debug(log, "Adding last hop."):
            # Last hop
            last_hop_port = (config.port
                             if len(bindings) == 2
                             else tunnels[-1].forwarder.local_bind_port)
            ssh_address_or_host = ("127.0.0.1", last_hop_port)
            local_bind_address = bindings[0].as_tuple()
            remote_bind_address = bindings[-1].as_tuple()

            log.debug("Ssh address is %s", ssh_address_or_host)
            log.debug("Local bind address is %s", local_bind_address)
            log.debug("Remote bind address is %s", remote_bind_address)

            def create_last_tunnel():
                return FirstHopTunnel(
                    forwarder=SSHTunnelForwarder(
                        ssh_address_or_host,
                        ssh_config_file=None,
                        ssh_username=config.user,
                        ssh_password=ssh_password,
                        ssh_pkey=ssh_pkey,
                        local_bind_address=local_bind_address,
                        remote_bind_address=remote_bind_address,
                        set_keepalive=TUNNEL_KEEPALIVE,
                        allow_agent=False,
                        logger=logger),
                    there=bindings[-1].port,
                    config=config)

            last_tunnel = retry_with_config(create_last_tunnel,
                                            name=Retry.OPEN_TUNNEL,
                                            config=config)
            stack.enter_context(close_tunnel_on_failure(last_tunnel))
            tunnels.append(last_tunnel)

        if len(tunnels) == 1:
            return tunnels[0]
        return MultiHopTunnel(tunnels=tunnels,
                              config=config)

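# A minimal usage sketch of build_tunnel, not taken from the source: it assumes
# Binding takes an address and a port, that `config` is a valid ClusterConfig,
# and that "node0001" is a hypothetical compute node. With exactly two bindings
# a single-hop tunnel is built from local port 8080 through the access node to
# remote port 8787; additional bindings in between would add intermediate hops.
example_bindings = [Binding("", 8080),        # local bind address
                    Binding("node0001", 8787)]  # final remote target
example_tunnel = build_tunnel(config=config,
                              bindings=example_bindings,
                              ssh_pkey="~/.ssh/id_rsa")
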
def deploy_dask_worker(node: NodeInternal,
                       scheduler: DaskSchedulerDeployment) -> DaskWorkerDeployment:  # noqa, pylint: disable=line-too-long
    """Deploys a Dask worker on the node.

    :param node: Node to deploy on.
    :param scheduler: Already deployed scheduler.
    """
    log = get_logger(__name__)
    with ExitStack() as stack:
        with stage_debug(log, "Creating a runtime dir."):
            runtime_dir = create_runtime_dir(node=node)
            stack.enter_context(
                remove_runtime_dir_on_failure(node=node,
                                              runtime_dir=runtime_dir))

        with stage_debug(log, "Obtaining a free remote port."):
            bokeh_port = get_free_remote_port(node=node)

        with stage_debug(log, "Creating a scratch subdirectory."):
            scratch_subdir = create_scratch_subdir(node=node)

        log_file = create_log_file(node=node, runtime_dir=runtime_dir)

        script_contents = get_worker_deployment_script(
            scheduler_address=scheduler.address,
            bokeh_port=bokeh_port,
            scratch_subdir=scratch_subdir,
            cores=node.cores,
            memory_limit=node.memory,
            log_file=log_file,
            config=node.config)

        log.debug("Deployment script contents: %s", script_contents)

        with stage_debug(log, "Deploying script."):
            deployment = deploy_generic(node=node,
                                        script_contents=script_contents,
                                        runtime_dir=runtime_dir)
            stack.enter_context(cancel_on_failure(deployment))

        @fabric.decorators.task
        def validate_worker_started_from_log():
            """Checks that the worker has started correctly based on
            the log file."""
            with capture_fabric_output_to_log():
                output = get_remote_file(remote_path=log_file)
            log.debug("Log file: %s", output)
            validate_worker_started(output=output)

        with stage_debug(log, "Checking if worker started."):
            retry_with_config(
                lambda: node.run_task(task=validate_worker_started_from_log),
                name=Retry.CHECK_WORKER_STARTED,
                config=node.config)

        with stage_debug(log,
                         "Opening a tunnel to bokeh diagnostics server."):
            bokeh_tunnel = node.tunnel(here=bokeh_port,
                                       there=bokeh_port)
            stack.enter_context(close_tunnel_on_failure(bokeh_tunnel))
        log.debug("Diagnostics local port: %d", bokeh_tunnel.here)

        return DaskWorkerDeployment(deployment=deployment,
                                    bokeh_tunnel=bokeh_tunnel)

def deploy_jupyter(node: NodeInternal, local_port: int) -> JupyterDeployment:
    """Deploys a Jupyter Notebook server on the node, and creates a tunnel
    to a local port.

    :param node: Node to deploy Jupyter Notebook on.
    :param local_port: Local tunnel binding port.
    """
    log = get_logger(__name__)

    with stage_debug(log, "Creating a runtime dir."):
        runtime_dir = create_runtime_dir(node=node)

    with stage_debug(log, "Obtaining a free remote port."):
        remote_port = get_free_remote_port(node=node)

    if node.config.use_jupyter_lab:
        jupyter_version = 'lab'
    else:
        jupyter_version = 'notebook'

    deployment_commands = [
        'export JUPYTER_RUNTIME_DIR="{runtime_dir}"'.format(
            runtime_dir=runtime_dir),
        get_command_to_append_local_bin()
    ]

    log_file = create_log_file(node=node, runtime_dir=runtime_dir)

    deployment_commands.append(
        'jupyter {jupyter_version}'
        ' --ip 127.0.0.1'
        ' --port "{remote_port}"'
        ' --no-browser > {log_file} 2>&1'.format(
            jupyter_version=jupyter_version,
            remote_port=remote_port,
            log_file=log_file))

    script_contents = get_deployment_script_contents(
        deployment_commands=deployment_commands,
        setup_actions=node.config.setup_actions.jupyter)

    log.debug("Deployment script contents: %s", script_contents)
    with stage_debug(log, "Deploying script."):
        deployment = deploy_generic(node=node,
                                    script_contents=script_contents,
                                    runtime_dir=runtime_dir)
    with cancel_on_failure(deployment):
        @fabric.decorators.task
        def load_nbserver_json():
            """Loads notebook parameters from a json file."""
            with capture_fabric_output_to_log():
                with cd(runtime_dir):
                    nbserver_json_path = run(
                        "readlink -vf $PWD/nbserver-*.json").splitlines()[0]
                    run("cat '{log_file}' || exit 0".format(
                        log_file=log_file))
                    run("cat '{nbserver_json_path}' > /dev/null".format(
                        nbserver_json_path=nbserver_json_path))
                    nbserver_json_str = get_remote_file(nbserver_json_path)
                    nbserver_json = json.loads(nbserver_json_str)
                    return (int(nbserver_json['port']),
                            nbserver_json['token'])

        with stage_debug(log,
                         "Obtaining info about notebook from json file."):
            actual_port, token = retry_with_config(
                lambda: node.run_task(task=load_nbserver_json),
                name=Retry.JUPYTER_JSON,
                config=node.config)

        with stage_debug(log, "Opening a tunnel to notebook."):
            tunnel = node.tunnel(there=actual_port,
                                 here=local_port)

        return JupyterDeploymentImpl(deployment=deployment,
                                     tunnel=tunnel,
                                     token=token)

def deploy_dask_scheduler(node: NodeInternal) -> DaskSchedulerDeployment:
    """Deploys a Dask scheduler on the node.

    :param node: Node to deploy on.
    """
    log = get_logger(__name__)
    with ExitStack() as stack:
        with stage_debug(log, "Creating a runtime dir."):
            runtime_dir = create_runtime_dir(node=node)
            stack.enter_context(
                remove_runtime_dir_on_failure(node=node,
                                              runtime_dir=runtime_dir))

        with stage_debug(log, "Obtaining free remote ports."):
            remote_port, bokeh_port = get_free_remote_ports(count=2,
                                                            node=node)

        with stage_debug(log, "Creating a scratch subdirectory."):
            scratch_subdir = create_scratch_subdir(node=node)

        log_file = create_log_file(node=node, runtime_dir=runtime_dir)

        script_contents = get_scheduler_deployment_script(
            remote_port=remote_port,
            bokeh_port=bokeh_port,
            scratch_subdir=scratch_subdir,
            log_file=log_file,
            config=node.config)

        log.debug("Deployment script contents: %s", script_contents)

        with stage_debug(log, "Deploying script."):
            deployment = deploy_generic(node=node,
                                        script_contents=script_contents,
                                        runtime_dir=runtime_dir)
            stack.enter_context(cancel_on_failure(deployment))

        @fabric.decorators.task
        def extract_address_from_log() -> str:
            """Extracts scheduler address from a log file."""
            with capture_fabric_output_to_log():
                output = get_remote_file(remote_path=log_file)
            log.debug("Log file: %s", output)
            return extract_address_from_output(output=output)

        with stage_debug(log, "Obtaining scheduler address."):
            address = retry_with_config(
                lambda: node.run_task(task=extract_address_from_log),
                name=Retry.GET_SCHEDULER_ADDRESS,
                config=node.config)

        with stage_debug(log, "Opening a tunnel to scheduler."):
            tunnel = node.tunnel(here=remote_port,
                                 there=remote_port)
            stack.enter_context(close_tunnel_on_failure(tunnel))
        log.debug("Scheduler local port: %d", tunnel.here)

        with stage_debug(log,
                         "Opening a tunnel to bokeh diagnostics server."):
            bokeh_tunnel = node.tunnel(here=bokeh_port,
                                       there=bokeh_port)
            stack.enter_context(close_tunnel_on_failure(bokeh_tunnel))
        log.debug("Diagnostics local port: %d", bokeh_tunnel.here)

        return DaskSchedulerDeployment(deployment=deployment,
                                       tunnel=tunnel,
                                       bokeh_tunnel=bokeh_tunnel,
                                       address=address)