Пример #1
0
    def cancel(self):
        """Kills the program together with all of its child processes.

            Removes the runtime dir.
            The kill command is retried according to
            the :attr:`.Retry.CANCEL_DEPLOYMENT` entry of the node config.

            :raises RuntimeError: If the program is still running.

        """

        top_pid = self._pid
        target_node = self._node
        # The process tree is captured once, before any kill is attempted.
        pid_list = ' '.join(str(pid)
                            for pid in ptree(pid=top_pid, node=target_node))

        # `kill -0` only probes the process: the command exits with 1
        # if the top level process is still alive after the kill.
        kill_command = ("kill {tree}"
                        "; kill -0 {parent_pid} && exit 1 || exit 0").format(
                            tree=pid_list, parent_pid=top_pid)

        def cancel_task():
            """Kills the process tree and fails if the parent is still
                running afterwards."""
            target_node.run(kill_command)

        log = get_logger(__name__)
        with remove_runtime_dir_on_exit(node=target_node,
                                        runtime_dir=self._runtime_dir), \
                stage_debug(log, "Killing the process tree for pid: %d",
                            top_pid):
            retry_with_config(fun=cancel_task,
                              name=Retry.CANCEL_DEPLOYMENT,
                              config=target_node.config)
Пример #2
0
def test_third_try_succeeds_one_retry_with_config():
    """Expects a RuntimeError when the task is set up with `fail_times=2`
        while only one retry is configured, and checks which failures
        were recorded."""
    recorded_failures = []
    port_info_retry = RetryConfigImpl(count=1, seconds_between=0)
    config = ClusterConfigImpl(host='h',
                               port=1,
                               user='******',
                               auth=AuthMethod.ASK,
                               retries={Retry.PORT_INFO: port_info_retry})

    with pytest.raises(RuntimeError):
        retry_with_config(
            fun=lambda: failing_task(fail_times=2,
                                     failures=recorded_failures),
            name=Retry.PORT_INFO,
            config=config)

    assert recorded_failures == [0, 1]
Пример #3
0
def connect_to_each_node(nodes: Sequence[Node], config: ClusterConfig):
    """Connects to every node in turn, so that connection issues come up
        before anything is actually deployed.

         :param nodes: Nodes to deploy Dask on.

         :param config: Cluster config, source of the retry settings.

    """
    log = get_logger(__name__)
    total = len(nodes)
    # Positions are 1-based because they are only used for progress output.
    for position, node in enumerate(nodes, start=1):
        with stage_info(log, "Connecting to %s:%d (%d/%d).", node.host,
                        node.port, position, total):
            retry_with_config(node.connect,
                              name=Retry.DASK_NODE_CONNECT,
                              config=config)
Пример #4
0
def check_scheduler_reachable_from_nodes(nodes: Sequence[Node],
                                         scheduler: DaskSchedulerDeployment,
                                         config: ClusterConfig):
    """Checks, node by node, that the scheduler can be reached,
        before any workers are deployed.

         :param nodes: Nodes to deploy Dask on.

         :param scheduler: Scheduler to connect to.

         :param config: Cluster config, source of the retry settings.

    """
    log = get_logger(__name__)
    total = len(nodes)
    # Positions are 1-based because they are only used for progress output.
    for position, node in enumerate(nodes, start=1):
        with stage_info(log,
                        "Checking scheduler connectivity from %s (%d/%d).",
                        node.host, position, total):
            # The node is bound as a default argument so the callable does
            # not depend on the loop variable.
            retry_with_config(
                lambda n=node: check_scheduler_reachable(
                    node=n, scheduler=scheduler),
                name=Retry.SCHEDULER_CONNECT,
                config=config)
Пример #5
0
def finalize_allocation(allocation_id: int,
                        hostnames: List[str],
                        nodes: List[NodeImpl],
                        parameters: AllocationParameters,
                        allocated_until: datetime.datetime,
                        config: ClusterConfig):
    """Fetches node ports and marks the nodes as allocated.

        Port lookup is first retried with `raise_on_missing=True`.
        If that ends with a `RuntimeError`, one more lookup is made with
        `raise_on_missing=False` and its result is used instead.

        :param allocation_id: Allocation id, e.g. Slurm job id.

        :param hostnames: List of hostnames.

        :param nodes: Nodes to update with information.

        :param parameters: Allocation parameters.

        :param allocated_until: Timestamp for job termination.

        :param config: Cluster config.

    """

    def lookup_ports(raise_on_missing: bool):
        return determine_ports_for_nodes(allocation_id=allocation_id,
                                         hostnames=hostnames,
                                         config=config,
                                         raise_on_missing=raise_on_missing)

    def try_to_determine_ports():
        return lookup_ports(raise_on_missing=True)

    # The retry multiplier grows by one for every started ten of nodes.
    retry_multiplier = int(ceil(len(hostnames) / 10))

    try:
        ports = retry_with_config(try_to_determine_ports,
                                  name=Retry.PORT_INFO,
                                  config=config,
                                  multiplier=retry_multiplier)
    except RuntimeError:
        ports = lookup_ports(raise_on_missing=False)

    for hostname, node_port, node in zip(hostnames, ports, nodes):
        node.make_allocated(host=hostname,
                            port=node_port,
                            cores=parameters.cores,
                            memory=parameters.memory_per_node,
                            allocated_until=allocated_until)
Пример #6
0
def validate_tunnel_http_connection(tunnel: TunnelInternal):
    """Checks whether there is an HTTP server replying to a request through
        the tunnel.

        A GET request is sent to the local end of the tunnel, retried
        according to the :attr:`.Retry.VALIDATE_HTTP_TUNNEL` config entry.
        The reply must declare a content type containing `text/html`.

         :param tunnel: Tunnel to validate.

         :raises RuntimeError: If the response is not declared as HTML,
                               including when it carries no content type
                               header at all.

    """

    def access_server():
        with requests.Session() as session:
            return session.get("http://127.0.0.1:{local_port}".format(
                local_port=tunnel.here))

    response = retry_with_config(access_server,
                                 name=Retry.VALIDATE_HTTP_TUNNEL,
                                 config=tunnel.config)

    # A missing header is treated as a non-HTML response rather than
    # letting a bare KeyError escape from the header lookup.
    content_type = response.headers.get('Content-type', '')
    if "text/html" not in content_type:
        raise RuntimeError("Unable to obtain a HTML response.")
Пример #7
0
def deploy_scheduler_on_first_node(
        nodes: Sequence[Node]) -> DaskSchedulerDeployment:  # noqa
    """Deploys a scheduler on the first node of the sequence, retrying
        according to the :attr:`.Retry.DEPLOY_DASK_SCHEDULER` config entry
        of that node.

        :param nodes: Nodes to deploy Dask on.

    """
    log = get_logger(__name__)
    first_node = nodes[0]  # type: NodeInternal
    assert isinstance(first_node, NodeInternal)

    with stage_info(log, "Deploying scheduler on the first node: %s.",
                    first_node.host):
        return retry_with_config(
            lambda: deploy_dask_scheduler(node=first_node),
            name=Retry.DEPLOY_DASK_SCHEDULER,
            config=first_node.config)
Пример #8
0
def allocate_slurm_nodes(parameters: AllocationParameters,
                         config: ClusterConfig) -> Nodes:
    """Tries to allocate nodes using Slurm.

       Runs sbatch on the access node, then looks the new job up with
       squeue to learn its node count. If the lookup fails, the job is
       cancelled with scancel before the error is raised.

       :param parameters:   Allocation parameters.

       :param config: Config for the cluster to allocate nodes on.

       :raises RuntimeError: If job info could not be obtained
                             after sbatch.

    """
    sbatch_args = SbatchArguments(params=parameters)

    log = get_logger(__name__)
    with stage_debug(log, "Executing sbatch on access node."):
        access_node = get_access_node(config=config)
        job_id, entry_point_script_path = run_sbatch(args=sbatch_args,
                                                     node=access_node)

    def run_squeue_task() -> SqueueResult:
        # Fails with a lookup error until the job shows up in squeue output.
        return run_squeue(node=access_node)[job_id]

    try:
        with stage_debug(log, "Obtaining info about job %d using squeue.",
                         job_id):
            squeue_result = retry_with_config(
                run_squeue_task,
                name=Retry.SQUEUE_AFTER_SBATCH,
                config=config)
    except Exception as e:  # noqa, pylint: disable=broad-except
        # Do not leave a job behind when nothing is known about it.
        run_scancel(job_id=job_id, node=access_node)
        raise RuntimeError("Unable to obtain job info"
                           " after allocation.") from e

    allocated_nodes = [NodeImpl(config=config)
                       for _ in range(squeue_result.node_count)]

    slurm_allocation = SlurmAllocation(
        job_id=job_id,
        access_node=access_node,
        nodes=allocated_nodes,
        entry_point_script_path=entry_point_script_path,
        parameters=parameters)

    return NodesImpl(nodes=allocated_nodes,
                     allocation=slurm_allocation)
Пример #9
0
    def tunnel(self, there: int, here: Optional[int] = None) -> TunnelInternal:
        """Opens a tunnel from a local port to a port on this node.

            When a specific local port is requested, only the first try
            uses it; every retry asks for `ANY_TUNNEL_PORT` instead.

            :param there: Remote port on the node.

            :param here: Local port; validated together with `there`
                         by `validate_tunnel_ports`.

            :raises RuntimeError: If the tunnel could not be opened.

        """
        try:
            log = get_logger(__name__)
            with stage_debug(log, "Opening tunnel %s -> %d to %s", here, there,
                             self):
                self._ensure_allocated()

                here, there = validate_tunnel_ports(here=here, there=there)

                is_first_try = True

                def get_bindings_and_build_tunnel() -> TunnelInternal:
                    nonlocal is_first_try
                    local_port = here if is_first_try else ANY_TUNNEL_PORT
                    bindings = get_bindings_with_single_gateway(
                        here=local_port,
                        node_host=self._host,
                        node_port=self._port,
                        there=there)
                    # Flipped only after the bindings were obtained.
                    is_first_try = False
                    return build_tunnel(config=self._config,
                                        bindings=bindings,
                                        ssh_password=env.password,
                                        ssh_pkey=env.key_filename)

                with authenticate(host=self._host,
                                  port=self._port,
                                  config=self._config):
                    if here != ANY_TUNNEL_PORT:
                        return retry_with_config(
                            get_bindings_and_build_tunnel,
                            name=Retry.TUNNEL_TRY_AGAIN_WITH_ANY_PORT,
                            config=self._config)
                    # Any port was requested up front, so there is nothing
                    # to fall back to: a single attempt is made.
                    return get_bindings_and_build_tunnel()

        except RuntimeError as e:
            message = "Unable to tunnel {there} on node '{host}'.".format(
                there=there, host=self._host)
            raise RuntimeError(message) from e
Пример #10
0
def deploy_worker_on_node(node: Node, scheduler: DaskSchedulerDeployment,
                          worker_number: int,
                          worker_count: int) -> DaskWorkerDeployment:
    """Deploys a worker on the node, retrying according to
        the :attr:`.Retry.DEPLOY_DASK_WORKER` config entry of the node.

        :param node: Node to deploy a worker on.

        :param scheduler: Scheduler for the worker.

        :param worker_number: Worker number out of workers to deploy,
                              used for progress output.

        :param worker_count: Count of workers being deployed,
                             used for progress output.

    """
    log = get_logger(__name__)
    with stage_info(log, "Deploying worker %d/%d.", worker_number,
                    worker_count):
        assert isinstance(node, NodeInternal)
        internal_node = node  # type: NodeInternal
        return retry_with_config(
            lambda: deploy_dask_worker(node=internal_node,
                                       scheduler=scheduler),
            name=Retry.DEPLOY_DASK_WORKER,
            config=internal_node.config)
Пример #11
0
def build_tunnel(config: ClusterConfig,
                 bindings: List[Binding],
                 ssh_password: Optional[str] = None,
                 ssh_pkey: Optional[str] = None) -> TunnelInternal:
    """Builds a multi-hop tunnel from a sequence of bindings.

        Hops are opened in order: a first hop to the cluster (only when
        there are more than two bindings), one middle hop for each of
        `bindings[2:-1]`, and a last hop from `bindings[0]`
        to `bindings[-1]`. Opening each hop is retried according to
        the :attr:`.Retry.OPEN_TUNNEL` config entry.

        :param config:   Cluster config.

        :param bindings: Sequence of bindings, starting with the local binding.

        :param ssh_password: Ssh password.

        :param ssh_pkey: Ssh private key.

        :returns: The only tunnel if a single hop was opened, otherwise
                  a :class:`MultiHopTunnel` wrapping all hops.

        :raises ValueError: If fewer than two bindings were given.
    """
    if len(bindings) < 2:
        raise ValueError("At least one local and one remote binding"
                         " is required to build a tunnel")

    with ExitStack() as stack:
        tunnels = []

        log = get_logger(__name__)
        log.debug("Ssh username: %s", config.user)
        log.debug("Using password: %r", ssh_password is not None)
        log.debug("Using key file: %s", ssh_pkey)

        logger = get_debug_logger("{}/Tunnels".format(__name__))

        def open_hop(ssh_address_or_host, local_bind_address,
                     remote_bind_address, there):
            """Opens one hop with retries, registers its cleanup
                in the exit stack and appends it to `tunnels`."""
            log.debug("Ssh address is %s", ssh_address_or_host)
            log.debug("Local bind address is %s", local_bind_address)
            log.debug("Remote bind address is %s", remote_bind_address)

            def create_tunnel():
                # A fresh forwarder is built on every try.
                return FirstHopTunnel(
                    forwarder=SSHTunnelForwarder(
                        ssh_address_or_host,
                        ssh_config_file=None,
                        ssh_username=config.user,
                        ssh_password=ssh_password,
                        ssh_pkey=ssh_pkey,
                        local_bind_address=local_bind_address,
                        remote_bind_address=remote_bind_address,
                        set_keepalive=TUNNEL_KEEPALIVE,
                        allow_agent=False,
                        logger=logger),
                    there=there,
                    config=config)

            hop = retry_with_config(create_tunnel,
                                    name=Retry.OPEN_TUNNEL,
                                    config=config)
            stack.enter_context(close_tunnel_on_failure(hop))
            tunnels.append(hop)

        # First hop if not the only one
        if len(bindings) != 2:
            with stage_debug(log, "Adding first hop."):
                open_hop(ssh_address_or_host=(config.host, config.port),
                         local_bind_address=ANY_ADDRESS,
                         remote_bind_address=bindings[1].as_tuple(),
                         there=bindings[1].port)

        # Middle hops if any. The slice is non-empty only for four or more
        # bindings, in which case the first hop is already in `tunnels`.
        for i, next_binding in enumerate(bindings[2:-1]):
            with stage_debug(log, "Adding middle hop %d.", i):
                # Connect through previous tunnel
                open_hop(
                    ssh_address_or_host=(
                        "127.0.0.1",
                        tunnels[-1].forwarder.local_bind_port),
                    local_bind_address=ANY_ADDRESS,
                    remote_bind_address=next_binding.as_tuple(),
                    there=next_binding.port)

        with stage_debug(log, "Adding last hop."):
            # NOTE(review): with exactly two bindings the ssh connection
            # targets 127.0.0.1 at config.port rather than config.host;
            # confirm this is intended.
            last_hop_port = (config.port
                             if len(bindings) == 2
                             else tunnels[-1].forwarder.local_bind_port)

            open_hop(ssh_address_or_host=("127.0.0.1", last_hop_port),
                     local_bind_address=bindings[0].as_tuple(),
                     remote_bind_address=bindings[-1].as_tuple(),
                     there=bindings[-1].port)

        if len(tunnels) == 1:
            return tunnels[0]
        return MultiHopTunnel(tunnels=tunnels,
                              config=config)
Пример #12
0
def deploy_dask_worker(node: NodeInternal,
                       scheduler: DaskSchedulerDeployment) -> DaskWorkerDeployment:  # noqa, pylint: disable=line-too-long
    """Deploys a Dask worker on the node.

        Steps, in order: create a runtime dir, pick a free remote port for
        the bokeh server, create a scratch subdirectory and a log file,
        deploy the generated script, wait until the log file passes
        `validate_worker_started`, and open a tunnel to the bokeh port.

        :param node: Node to deploy on.

        :param scheduler: Already deployed scheduler.

    """
    log = get_logger(__name__)

    # Cleanup contexts are collected as resources get created.
    with ExitStack() as cleanup:
        with stage_debug(log, "Creating a runtime dir."):
            runtime_dir = create_runtime_dir(node=node)
            cleanup.enter_context(
                remove_runtime_dir_on_failure(node=node,
                                              runtime_dir=runtime_dir))

        with stage_debug(log, "Obtaining a free remote port."):
            bokeh_port = get_free_remote_port(node=node)

        with stage_debug(log, "Creating a scratch subdirectory."):
            scratch_subdir = create_scratch_subdir(node=node)

        log_file = create_log_file(node=node, runtime_dir=runtime_dir)

        worker_script = get_worker_deployment_script(
            scheduler_address=scheduler.address,
            bokeh_port=bokeh_port,
            scratch_subdir=scratch_subdir,
            cores=node.cores,
            memory_limit=node.memory,
            log_file=log_file,
            config=node.config)

        log.debug("Deployment script contents: %s", worker_script)

        with stage_debug(log, "Deploying script."):
            generic_deployment = deploy_generic(
                node=node,
                script_contents=worker_script,
                runtime_dir=runtime_dir)
            cleanup.enter_context(cancel_on_failure(generic_deployment))

        @fabric.decorators.task
        def validate_worker_started_from_log():
            """Fetches the remote log file and validates its contents."""
            with capture_fabric_output_to_log():
                log_contents = get_remote_file(remote_path=log_file)
            log.debug("Log file: %s", log_contents)
            validate_worker_started(output=log_contents)

        with stage_debug(log, "Checking if worker started."):
            retry_with_config(
                lambda: node.run_task(task=validate_worker_started_from_log),
                name=Retry.CHECK_WORKER_STARTED,
                config=node.config)

        with stage_debug(log, "Opening a tunnel to bokeh diagnostics server."):
            # The same port number is requested on both ends.
            diagnostics_tunnel = node.tunnel(here=bokeh_port,
                                             there=bokeh_port)
            cleanup.enter_context(close_tunnel_on_failure(diagnostics_tunnel))
            log.debug("Diagnostics local port: %d", diagnostics_tunnel.here)

        return DaskWorkerDeployment(deployment=generic_deployment,
                                    bokeh_tunnel=diagnostics_tunnel)
Пример #13
0
def deploy_jupyter(node: NodeInternal, local_port: int) -> JupyterDeployment:
    """Deploys a Jupyter Notebook server on the node, and creates a tunnel
        to a local port.

        Jupyter Lab is started instead of the classic notebook when
        `node.config.use_jupyter_lab` is set. The actual port and token
        are read from the `nbserver-*.json` file found in the runtime dir,
        retried according to the :attr:`.Retry.JUPYTER_JSON` config entry.

        The runtime dir cleanup is registered as soon as the dir is
        created, in the same way as in the Dask deployment functions, so
        a failure in any later step does not leave it behind.

        :param node: Node to deploy Jupyter Notebook on.

        :param local_port: Local tunnel binding port.

    """
    log = get_logger(__name__)

    with ExitStack() as stack:
        with stage_debug(log, "Creating a runtime dir."):
            runtime_dir = create_runtime_dir(node=node)
            stack.enter_context(
                remove_runtime_dir_on_failure(node=node,
                                              runtime_dir=runtime_dir))

        with stage_debug(log, "Obtaining a free remote port."):
            remote_port = get_free_remote_port(node=node)

        jupyter_version = 'lab' if node.config.use_jupyter_lab else 'notebook'

        deployment_commands = [
            'export JUPYTER_RUNTIME_DIR="{runtime_dir}"'.format(
                runtime_dir=runtime_dir),
            get_command_to_append_local_bin()
        ]

        log_file = create_log_file(node=node, runtime_dir=runtime_dir)

        # The server output, stderr included, is redirected to the log file.
        deployment_commands.append('jupyter {jupyter_version}'
                                   ' --ip 127.0.0.1'
                                   ' --port "{remote_port}"'
                                   ' --no-browser > {log_file} 2>&1'.format(
                                       jupyter_version=jupyter_version,
                                       remote_port=remote_port,
                                       log_file=log_file))

        script_contents = get_deployment_script_contents(
            deployment_commands=deployment_commands,
            setup_actions=node.config.setup_actions.jupyter)

        log.debug("Deployment script contents: %s", script_contents)
        with stage_debug(log, "Deploying script."):
            deployment = deploy_generic(node=node,
                                        script_contents=script_contents,
                                        runtime_dir=runtime_dir)
            stack.enter_context(cancel_on_failure(deployment))

        @fabric.decorators.task
        def load_nbserver_json():
            """Loads notebook parameters from a json file."""
            with capture_fabric_output_to_log():
                with cd(runtime_dir):
                    # Only the first matching file is used.
                    nbserver_json_path = run(
                        "readlink -vf $PWD/nbserver-*.json").splitlines()[0]
                # Dumps the server log for diagnostics; never fails.
                run("cat '{log_file}' || exit 0".format(log_file=log_file))
                # Fails the task if the json file cannot be read yet.
                run("cat '{nbserver_json_path}' > /dev/null".format(
                    nbserver_json_path=nbserver_json_path))
                nbserver_json_str = get_remote_file(nbserver_json_path)
                nbserver_json = json.loads(nbserver_json_str)
                return int(nbserver_json['port']), nbserver_json['token']

        with stage_debug(log, "Obtaining info about notebook from json file."):
            actual_port, token = retry_with_config(
                lambda: node.run_task(task=load_nbserver_json),
                name=Retry.JUPYTER_JSON,
                config=node.config)

        with stage_debug(log, "Opening a tunnel to notebook."):
            # The port reported by the server is used, not the one
            # requested on the command line.
            tunnel = node.tunnel(there=actual_port, here=local_port)

        return JupyterDeploymentImpl(deployment=deployment,
                                     tunnel=tunnel,
                                     token=token)
Пример #14
0
def deploy_dask_scheduler(node: NodeInternal) -> DaskSchedulerDeployment:
    """Deploys a Dask scheduler on the node.

        Steps, in order: create a runtime dir, pick two free remote ports
        (scheduler and bokeh), create a scratch subdirectory and a log
        file, deploy the generated script, extract the scheduler address
        from the log file, and open tunnels to both ports.

        :param node: Node to deploy on.

    """
    log = get_logger(__name__)

    # Cleanup contexts are collected as resources get created.
    with ExitStack() as cleanup:
        with stage_debug(log, "Creating a runtime dir."):
            runtime_dir = create_runtime_dir(node=node)
            cleanup.enter_context(
                remove_runtime_dir_on_failure(node=node,
                                              runtime_dir=runtime_dir))

        with stage_debug(log, "Obtaining free remote ports."):
            remote_port, bokeh_port = get_free_remote_ports(count=2, node=node)

        with stage_debug(log, "Creating a scratch subdirectory."):
            scratch_subdir = create_scratch_subdir(node=node)

        log_file = create_log_file(node=node, runtime_dir=runtime_dir)

        scheduler_script = get_scheduler_deployment_script(
            remote_port=remote_port,
            bokeh_port=bokeh_port,
            scratch_subdir=scratch_subdir,
            log_file=log_file,
            config=node.config)

        log.debug("Deployment script contents: %s", scheduler_script)

        with stage_debug(log, "Deploying script."):
            generic_deployment = deploy_generic(
                node=node,
                script_contents=scheduler_script,
                runtime_dir=runtime_dir)
            cleanup.enter_context(cancel_on_failure(generic_deployment))

        @fabric.decorators.task
        def extract_address_from_log() -> str:
            """Fetches the remote log file and extracts the scheduler
                address from its contents."""
            with capture_fabric_output_to_log():
                log_contents = get_remote_file(remote_path=log_file)
            log.debug("Log file: %s", log_contents)
            return extract_address_from_output(output=log_contents)

        with stage_debug(log, "Obtaining scheduler address."):
            scheduler_address = retry_with_config(
                lambda: node.run_task(task=extract_address_from_log),
                name=Retry.GET_SCHEDULER_ADDRESS,
                config=node.config)

        # For both tunnels the same port number is requested on both ends.
        with stage_debug(log, "Opening a tunnel to scheduler."):
            scheduler_tunnel = node.tunnel(here=remote_port,
                                           there=remote_port)
            cleanup.enter_context(close_tunnel_on_failure(scheduler_tunnel))
            log.debug("Scheduler local port: %d", scheduler_tunnel.here)

        with stage_debug(log, "Opening a tunnel to bokeh diagnostics server."):
            diagnostics_tunnel = node.tunnel(here=bokeh_port,
                                             there=bokeh_port)
            cleanup.enter_context(close_tunnel_on_failure(diagnostics_tunnel))
            log.debug("Diagnostics local port: %d", diagnostics_tunnel.here)

        return DaskSchedulerDeployment(deployment=generic_deployment,
                                       tunnel=scheduler_tunnel,
                                       bokeh_tunnel=diagnostics_tunnel,
                                       address=scheduler_address)