def test_able_to_sync_jupyter():
    """Pushing a Jupyter deployment should make it appear on pull, and the
        pulled copy should tunnel through its own (different) local port."""
    user = USER_47
    with ExitStack() as ctx:
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))
        ctx.enter_context(clear_deployment_sync_data(user))

        cluster = show_cluster(name=TEST_CLUSTER)

        allocation = cluster.allocate_nodes()
        ctx.enter_context(cancel_on_exit(allocation))
        first_node = allocation[0]
        allocation.wait(timeout=SLURM_WAIT_TIMEOUT)

        jupyter = first_node.deploy_notebook(local_port=get_free_local_port())
        ctx.enter_context(cancel_on_exit(jupyter))

        # Nothing was pushed yet, so nothing should be pulled.
        assert not cluster.pull_deployments().jupyter_deployments

        cluster.push_deployment(deployment=jupyter)
        deployments = cluster.pull_deployments()
        print(deployments)

        pulled = deployments.jupyter_deployments
        assert len(pulled) == 1
        jupyter_synced = pulled[0]
        try:
            # The pulled deployment opens its own tunnel on a fresh port.
            assert jupyter.local_port != jupyter_synced.local_port
            check_local_http_connection(port=jupyter.local_port)
            check_local_http_connection(port=jupyter_synced.local_port)
        finally:
            jupyter_synced.cancel_local()
# Exemplo n.º 2
def test_cancelled_dask_allocation_is_discarded_on_pull():
    """A Dask deployment that was cancelled after being pushed should not
        show up in subsequently pulled deployments."""
    user = USER_56
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))
        stack.enter_context(clear_deployment_sync_data(user))

        cluster = show_cluster(name=TEST_CLUSTER)

        nodes = cluster.allocate_nodes()
        stack.enter_context(cancel_on_exit(nodes))
        nodes.wait(timeout=SLURM_WAIT_TIMEOUT)

        dask = deploy_dask(nodes)
        # No cancel_on_exit here: the deployment is cancelled manually below
        # and the finally clause covers failure paths. Registering
        # cancel_on_exit as well would cancel the deployment a second time
        # after it was already cancelled (cf. the jupyter analog of this
        # test, which also relies on try/finally only).
        try:
            deployments = cluster.pull_deployments()
            # This test deploys Dask, so it must inspect dask_deployments
            # (the original checked jupyter_deployments — a copy-paste bug
            # that made the assertions vacuous).
            assert not deployments.dask_deployments

            cluster.push_deployment(deployment=dask)

            dask.cancel()
            dask = None  # Marks the deployment as already cleaned up.

            # The pushed deployment was cancelled, so pulling discards it.
            deployments = cluster.pull_deployments()
            assert not deployments.dask_deployments
        finally:
            if dask is not None:
                dask.cancel()
# Exemplo n.º 3
def test_able_to_sync_dask():
    """A pushed Dask deployment should be pulled back as a working copy
        exposing diagnostics through different local addresses."""
    user = USER_55
    with ExitStack() as ctx:
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))
        ctx.enter_context(clear_deployment_sync_data(user))

        cluster = show_cluster(name=TEST_CLUSTER)

        allocation = cluster.allocate_nodes()
        ctx.enter_context(cancel_on_exit(allocation))
        allocation.wait(timeout=SLURM_WAIT_TIMEOUT)

        dask = deploy_dask(allocation)
        ctx.enter_context(cancel_on_exit(dask))

        # Before the push, pulling yields no Dask deployments.
        assert not cluster.pull_deployments().dask_deployments

        cluster.push_deployment(deployment=dask)
        deployments = cluster.pull_deployments()
        print(deployments)

        pulled = deployments.dask_deployments
        assert len(pulled) == 1
        dask_synced = pulled[0]
        try:
            # The pulled copy tunnels diagnostics through its own ports.
            assert dask.diagnostics.addresses != dask_synced.diagnostics.addresses
            for url in dask.diagnostics.addresses:
                check_http_connection(url=url)
            for url in dask_synced.diagnostics.addresses:
                check_http_connection(url=url)
        finally:
            dask_synced.cancel_local()
def test_cancelled_jupyter_allocation_is_discarded_on_pull():
    """A Jupyter deployment cancelled after being pushed should be absent
        from subsequently pulled deployments."""
    user = USER_48
    with ExitStack() as ctx:
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))
        ctx.enter_context(clear_deployment_sync_data(user))

        cluster = show_cluster(name=TEST_CLUSTER)

        allocation = cluster.allocate_nodes()
        ctx.enter_context(cancel_on_exit(allocation))
        first_node = allocation[0]
        allocation.wait(timeout=SLURM_WAIT_TIMEOUT)

        jupyter = first_node.deploy_notebook(local_port=get_free_local_port())
        try:
            assert not cluster.pull_deployments().jupyter_deployments

            cluster.push_deployment(deployment=jupyter)

            jupyter.cancel()
            jupyter = None  # Marks the deployment as already cleaned up.

            # The cancelled deployment must be discarded on pull.
            assert not cluster.pull_deployments().jupyter_deployments
        finally:
            if jupyter is not None:
                jupyter.cancel()
# Exemplo n.º 5
def test_nodes_sync_does_not_work_when_waiting_twice():
    """Port info was already deleted, so waiting for the second time defaults
        to port 22."""
    user = USER_44
    with ExitStack() as ctx:
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))
        ctx.enter_context(clear_deployment_sync_data(user))

        cluster = show_cluster(name=TEST_CLUSTER)

        allocation = cluster.allocate_nodes()
        with cancel_on_exit(allocation):
            cluster.push_deployment(deployment=allocation)

            allocation.wait(timeout=SLURM_WAIT_TIMEOUT)
            assert allocation.running()
            first_node = allocation[0]
            # The first wait picked up the real sshd port.
            assert first_node.port != 22

            pulled = cluster.pull_deployments().nodes
            assert len(pulled) == 1
            synced_allocation = pulled[0]

            synced_allocation.wait(timeout=SLURM_WAIT_TIMEOUT)
            assert synced_allocation.running()
            synced_node = synced_allocation[0]

            # Port info is gone by now, so the second wait falls back to 22.
            assert synced_node.port == 22
            assert synced_node.host == first_node.host
# Exemplo n.º 6
def test_dask_deployment_with_redeploy_failure():
    """Dask deployment should raise RuntimeError when worker validation
        keeps failing."""
    user = USER_42
    with ExitStack() as ctx:
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        allocation = cluster.allocate_nodes(nodes=2,
                                            cores=1,
                                            memory_per_node=MiB(100),
                                            walltime=Walltime(minutes=30))
        ctx.enter_context(cancel_on_exit(allocation))

        assert idact.detail.dask.deploy_dask_impl.validate_worker
        original_validate_worker = \
            idact.detail.dask.deploy_dask_impl.validate_worker

        def fake_validate_worker(worker: DaskWorkerDeployment):
            # Rejects every worker, so redeployment can never succeed.
            print("Fake worker validation.")
            raise ValueError("Fake worker validation fail.")

        try:
            idact.detail.dask.deploy_dask_impl.validate_worker = \
                fake_validate_worker

            with pytest.raises(RuntimeError):
                with deploy_dask_on_testing_cluster(allocation):
                    pass

        finally:
            # Always restore the monkeypatched module attribute.
            idact.detail.dask.deploy_dask_impl.validate_worker = \
                original_validate_worker
# Exemplo n.º 7
 def cancel(self):
     """Cancels the Jupyter deployment.

         Cancels the local part first; the remote deployment is cancelled
         when the ExitStack unwinds (``cancel_on_exit``), so it runs even
         if ``cancel_local`` raises.
     """
     log = get_logger(__name__)
     with ExitStack() as stack:
         stack.enter_context(
             stage_info(log, "Cancelling Jupyter deployment."))
         # Registered before cancel_local, therefore executed after it
         # on stack exit (LIFO unwind).
         stack.enter_context(cancel_on_exit(self._deployment))
         self.cancel_local()
# Exemplo n.º 8
def discard_invalid_workers(workers: List[DaskWorkerDeployment],
                            stack: ExitStack) \
    -> Tuple[
        List[DaskWorkerDeployment],
        List[Node]]:
    """Validates each worker. Returns a tuple of valid workers and nodes
        for which the workers could not be validated.

        :param workers: Workers to validate.

        :param stack: Exit stack. Failed workers will be cancelled on exit.

    """
    log = get_logger(__name__)
    valid_workers = []
    nodes_to_redeploy = []
    worker_count = len(workers)
    for i, worker in enumerate(workers):
        try:
            with stage_info(log, "Validating worker %d/%d.", i + 1,
                            worker_count):
                validate_worker(worker=worker)
            valid_workers.append(worker)
        except Exception:  # noqa, pylint: disable=broad-except
            # Validation failure is expected and non-fatal: queue the node
            # for redeployment and cancel the broken worker on stack exit.
            # exc_info=True (was exc_info=1) is the idiomatic spelling; the
            # logging module treats any truthy value identically.
            log.debug("Failed to validate worker. Exception:", exc_info=True)
            nodes_to_redeploy.append(worker.deployment.node)
            stack.enter_context(cancel_on_exit(worker))

    return valid_workers, nodes_to_redeploy
# Exemplo n.º 9
def test_remove_runtime_dir_test():
    """Exercises the runtime-dir removal helpers on an allocated node,
        cleaning the remote home directory afterwards."""
    user = USER_15
    with ExitStack() as ctx:
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        allocation = cluster.allocate_nodes(nodes=1,
                                            cores=1,
                                            memory_per_node=MiB(100),
                                            walltime=Walltime(minutes=30))
        ctx.enter_context(cancel_on_exit(allocation))
        target = allocation[0]
        try:
            allocation.wait(timeout=SLURM_WAIT_TIMEOUT)
            assert allocation.running()

            for check in (check_will_remove_empty,
                          check_will_ignore_non_existent,
                          check_will_remove_files,
                          check_will_not_remove_dotfiles,
                          check_will_not_remove_nested_dirs):
                check(node=target)
        finally:
            # Wipe whatever the checks left in the remote home directory.
            target.run("rm -rf *")
# Exemplo n.º 10
def deploy_jupyter(nodes: Nodes):
    """Deploys a Jupyter notebook on the first node and yields that node
        while the deployment is up.

        NOTE(review): callers use this via ``with``, so a ``@contextmanager``
        decorator presumably exists at the definition site — confirm.

        On exit, cancels the deployment and retries until no jupyter
        process remains for the user."""
    ps_jupyter = "ps -u $USER | grep jupyter ; exit 0"

    node = nodes[0]
    nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
    assert nodes.running()

    local_port = get_free_local_port()
    deployment = node.deploy_notebook(local_port=local_port)
    with cancel_on_exit(deployment):
        print(deployment)
        assert repr(deployment) == str(deployment)

        assert deployment.local_port == local_port

        # Exactly one jupyter process should be running for the user.
        jupyter_processes = node.run(ps_jupyter).splitlines()
        pprint(jupyter_processes)
        assert len(jupyter_processes) == 1

        check_local_http_connection(port=local_port)

        yield node

    # After cancellation, the jupyter process should eventually disappear.
    retry(lambda: check_no_output(node=node, command=ps_jupyter),
          retries=5 * get_testing_process_count(),
          seconds_between_retries=1)
# Exemplo n.º 11
def check_remote_key_and_node_access(stack: ExitStack, user: str):
    """Verifies public key installation: the key is never added to the
        access node's authorized_keys, but allocation installs it in
        authorized_keys.idact for compute nodes, enabling passwordless
        access to them (though not from the access node directly)."""
    public_key_value = get_public_key_value()

    cluster = show_cluster(name=TEST_CLUSTER)
    access_node = cluster.get_access_node()
    with set_password(get_test_user_password(user)):
        assert access_node.run('whoami') == user
    # Still works outside the password context.
    assert access_node.run('whoami') == user

    grep_authorized_keys = \
        "grep '{public_key_value}' ~/.ssh/authorized_keys".format(
            public_key_value=public_key_value)
    grep_idact_keys = \
        "grep '{public_key_value}' ~/.ssh/authorized_keys.idact".format(
            public_key_value=public_key_value)

    access_node.run(grep_authorized_keys)

    # Before any allocation, the idact keys file does not contain the key.
    with pytest.raises(RuntimeError):
        access_node.run(grep_idact_keys)

    allocation = cluster.allocate_nodes(nodes=2,
                                        cores=1,
                                        memory_per_node=MiB(100),
                                        walltime=Walltime(minutes=30))
    stack.enter_context(cancel_on_exit(allocation))
    print(allocation)

    allocation.wait(timeout=SLURM_WAIT_TIMEOUT)
    # Allocation installed the key for compute nodes.
    access_node.run(grep_idact_keys)

    # Access to node without password works.
    assert allocation[0].run('whoami') == user

    check_direct_access_from_access_node_does_not_work(allocation[0])
# Exemplo n.º 12
def test_node_tunnel_fall_back_when_local_port_taken():
    """Checks that a tunnel will fall back to a random port if local port is
        taken."""
    user = USER_53
    with ExitStack() as ctx:
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        allocation = cluster.allocate_nodes(nodes=1,
                                            cores=1,
                                            memory_per_node=MiB(100),
                                            walltime=Walltime(minutes=30))
        ctx.enter_context(cancel_on_exit(allocation))

        target = allocation[0]
        allocation.wait(timeout=SLURM_WAIT_TIMEOUT)

        remote_port = get_free_remote_port(node=target)
        local_port = get_free_local_port()

        # First tunnel gets the requested local port.
        first_tunnel = target.tunnel(there=remote_port, here=local_port)
        ctx.enter_context(close_tunnel_on_exit(first_tunnel))
        assert first_tunnel.here == local_port

        # Second tunnel finds the port taken and must fall back.
        second_tunnel = target.tunnel(there=remote_port, here=local_port)
        ctx.enter_context(close_tunnel_on_exit(second_tunnel))
        assert second_tunnel.here != local_port
# Exemplo n.º 13
 def cancel(self):
     """Cancels the scheduler deployment.

         Cancels the local part first; the remote deployment is cancelled
         when the ExitStack unwinds (``cancel_on_exit``), so it runs even
         if ``cancel_local`` raises.
     """
     log = get_logger(__name__)
     with ExitStack() as stack:
         stack.enter_context(
             stage_info(log, "Cancelling scheduler deployment on %s.",
                        self._deployment.node.host))
         # Registered before cancel_local, therefore executed after it
         # on stack exit (LIFO unwind).
         stack.enter_context(cancel_on_exit(self._deployment))
         self.cancel_local()
# Exemplo n.º 14
def deploy_dask_on_testing_cluster(nodes: Nodes):
    """Deploys Dask on the given nodes and yields the first node while the
        deployment is running.

        Verifies scheduler and worker processes, diagnostics addresses,
        and task submission; on exit, cancels the deployment and retries
        until no dask processes remain for the user.

        NOTE(review): callers use this via ``with``, so a
        ``@contextmanager`` decorator presumably exists at the definition
        site — confirm.

        :param nodes: Allocated nodes to deploy Dask on.
    """
    # `exit 0` keeps grep's no-match exit status from failing node.run.
    ps_dask_worker = "ps -u $USER | grep [d]ask-worker ; exit 0"
    ps_dask_scheduler = "ps -u $USER | grep [d]ask-scheduler ; exit 0"

    node = nodes[0]
    nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
    assert nodes.running()

    # No workers should be running before the deployment.
    ps_lines = node.run(ps_dask_worker).splitlines()
    pprint(ps_lines)
    assert not ps_lines

    deployment = deploy_dask(nodes=nodes)
    with cancel_on_exit(deployment):
        print(deployment)
        assert str(deployment) == repr(deployment)

        # Exactly one scheduler process.
        ps_lines = node.run(ps_dask_scheduler).splitlines()
        pprint(ps_lines)
        assert len(ps_lines) == 1

        # At least one worker process per node.
        ps_lines = node.run(ps_dask_worker).splitlines()
        pprint(ps_lines)
        # some workers may have been redeployed
        assert len(ps_lines) >= len(nodes)

        client = deployment.get_client()
        print(client)

        check_submission_works(node=node, client=client)

        # One diagnostics address per worker plus one for the scheduler.
        pprint(deployment.diagnostics.addresses)
        assert len(deployment.diagnostics.addresses) == len(nodes) + 1

        # Every diagnostics endpoint serves an HTML dashboard.
        for address in deployment.diagnostics.addresses:
            request = requests.get(address)
            assert "text/html" in request.headers['Content-type']

        # open_all should open exactly the diagnostics addresses.
        opened_addresses = []
        with save_opened_in(opened_addresses):
            deployment.diagnostics.open_all()

        assert opened_addresses == deployment.diagnostics.addresses

        yield node

    # After cancellation, dask processes should eventually disappear.
    retry(lambda: check_no_output(node=node, command=ps_dask_scheduler),
          retries=5 * get_testing_process_count(), seconds_between_retries=1)

    retry(lambda: check_no_output(node=node, command=ps_dask_worker),
          retries=5 * get_testing_process_count(), seconds_between_retries=1)
# Exemplo n.º 15
def test_node_tunnel_stress():
    """Runs the tunnel stress test on a freshly allocated single node."""
    user = USER_40
    with ExitStack() as ctx:
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        allocation = cluster.allocate_nodes(nodes=1,
                                            cores=1,
                                            memory_per_node=MiB(100))
        ctx.enter_context(cancel_on_exit(allocation))
        run_tunnel_stress_test(stack=ctx, user=user, nodes=allocation)
# Exemplo n.º 16
def test_generic_deployment():
    """Deploys a generic script on a node, checks the process is alive
        until cancelled, then verifies it is gone."""
    user = USER_7
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        print(cluster)
        nodes = cluster.allocate_nodes(nodes=1,
                                       cores=1,
                                       memory_per_node=MiB(100),
                                       walltime=Walltime(minutes=30))
        stack.enter_context(cancel_on_exit(nodes))
        node = nodes[0]

        nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
        assert nodes.running()

        # create_runtime_dir/deploy_generic need the internal node API.
        # (The original repeated this assertion twice; once is enough.)
        assert isinstance(node, NodeInternal)
        runtime_dir = create_runtime_dir(node=node)
        stack.enter_context(
            remove_runtime_dir_on_failure(node=node, runtime_dir=runtime_dir))
        script_contents = "echo ABC && sleep 30"

        deployment = deploy_generic(node=node,
                                    script_contents=script_contents,
                                    runtime_dir=runtime_dir)
        with cancel_on_exit(deployment):
            print(deployment)

            # kill -0 succeeds while the process is alive.
            node.run("kill -0 {pid}".format(pid=deployment.pid))

        # After cancellation, the process no longer exists.
        with pytest.raises(RuntimeError):
            node.run("kill -0 {pid}".format(pid=deployment.pid))
# Exemplo n.º 17
def test_node_tunnel_fall_back_when_local_port_free_but_fails():
    """Checks that a tunnel will fall back to a random port if local port
        is initially free, but tunnel cannot be created anyway (e.g. another
        process binds to it at the last moment)."""
    user = USER_54
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        nodes = cluster.allocate_nodes(nodes=1,
                                       cores=1,
                                       memory_per_node=MiB(100),
                                       walltime=Walltime(minutes=30))
        stack.enter_context(cancel_on_exit(nodes))

        node = nodes[0]
        nodes.wait(timeout=SLURM_WAIT_TIMEOUT)

        there = get_free_remote_port(node=node)
        here = get_free_local_port()

        real_build_tunnel = idact.detail.nodes.node_impl.build_tunnel
        # Removed: an unused socket was created and closed here without
        # ever being bound; the last-moment failure is simulated by
        # fake_build_tunnel below instead.

        # Counts calls to the patched build_tunnel.
        tries = [0]

        def fake_build_tunnel(*args, **kwargs) -> TunnelInternal:
            # First attempt fails as if the port was grabbed at the last
            # moment; the second attempt (random port) goes through.
            tries[0] += 1
            if tries[0] == 1:
                raise RuntimeError("Fake failure.")
            if tries[0] != 2:
                assert False

            return real_build_tunnel(*args, **kwargs)

        try:
            idact.detail.nodes.node_impl.build_tunnel = fake_build_tunnel
            tunnel = node.tunnel(there=there, here=here)
            stack.enter_context(close_tunnel_on_exit(tunnel))
            assert tries[0] == 2
            assert tunnel.here != here
        finally:
            # Always restore the monkeypatched module attribute.
            idact.detail.nodes.node_impl.build_tunnel = real_build_tunnel
# Exemplo n.º 18
def test_jupyter_deployment():
    """Allocates a node and runs the shared Jupyter deployment checks."""
    user = USER_6
    with ExitStack() as ctx:
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        allocation = cluster.allocate_nodes(nodes=1,
                                            cores=1,
                                            memory_per_node=MiB(100),
                                            walltime=Walltime(minutes=30))
        ctx.enter_context(cancel_on_exit(allocation))

        with deploy_jupyter(allocation):
            pass
# Exemplo n.º 19
def test_able_to_reach_nodes_when_using_password_based_authentication():
    """It should be possible to connect to compute nodes even when using
        password-based authentication, because local public key is authorized
        for the compute nodes after initial connection.
        However, direct connection from access node should fail.
        Password is still used between the client and the access node."""
    user = USER_10
    with ExitStack() as ctx:
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user=user, auth=AuthMethod.ASK))
        ctx.enter_context(set_password(get_test_user_password(user)))
        ctx.enter_context(disable_pytest_stdin())
        cluster = show_cluster(TEST_CLUSTER)
        access_node = cluster.get_access_node()

        allocation = cluster.allocate_nodes(nodes=2,
                                            cores=1,
                                            memory_per_node=MiB(100),
                                            walltime=Walltime(minutes=30))
        ctx.enter_context(cancel_on_exit(allocation))
        print(allocation)

        allocation.wait(timeout=SLURM_WAIT_TIMEOUT)

        compute_node = allocation[0]
        assert isinstance(compute_node, NodeInternal)

        public_key_value = get_public_key_value()

        # Local key was installed for the deployed sshd, allowing access
        # between the access node and compute nodes.
        assert allocation[0].run('whoami') == user

        # Local key was not installed for the access node
        with pytest.raises(RuntimeError):
            access_node.run(
                "grep '{public_key_value}' ~/.ssh/authorized_keys".format(
                    public_key_value=public_key_value))

        # But it was installed for compute nodes.
        access_node.run("grep '{public_key_value}'"
                        " ~/.ssh/authorized_keys.idact".format(
                            public_key_value=public_key_value))

        check_direct_access_from_access_node_does_not_work(allocation[0])
# Exemplo n.º 20
def test_dask_deployment_with_absolute_scratch_path():
    """Dask deployment should work with an absolute scratch directory."""
    user = USER_24
    with ExitStack() as ctx:
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        # Absolute path instead of the default scratch location.
        cluster.config.scratch = '/home/user-24'

        allocation = cluster.allocate_nodes(nodes=1,
                                            cores=1,
                                            memory_per_node=MiB(100),
                                            walltime=Walltime(minutes=10))
        ctx.enter_context(cancel_on_exit(allocation))

        with deploy_dask_on_testing_cluster(allocation):
            pass
# Exemplo n.º 21
def test_migrate_deployments():
    """Migrating from an old version of the deployments file should work
        without fatal errors."""
    user = USER_57
    with ExitStack() as ctx:
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))
        ctx.enter_context(clear_deployment_sync_data(user))

        cluster = show_cluster(name=TEST_CLUSTER)
        access_node = cluster.get_access_node()
        assert isinstance(access_node, NodeInternal)

        def check_deployments_file_exists():
            # Fails with RuntimeError when the file is absent.
            access_node.run("cat ~/.idact/.deployments")

        allocation = cluster.allocate_nodes()
        ctx.enter_context(cancel_on_exit(allocation))
        allocation.wait(timeout=SLURM_WAIT_TIMEOUT)

        with pytest.raises(RuntimeError):
            check_deployments_file_exists()

        # Plant an old-format deployments file on the cluster.
        remote_path = access_node.run("echo ~/.idact/.deployments")
        put_file_on_node(node=access_node,
                         remote_path=remote_path,
                         contents='{{"type": "{type}"}}'.format(
                             type=SerializableTypes.DEPLOYMENT_DEFINITIONS))

        # The old file is readable but holds no node deployments.
        assert not cluster.pull_deployments().nodes

        cluster.push_deployment(deployment=allocation)

        assert len(cluster.pull_deployments().nodes) == 1

        cluster.clear_pushed_deployments()

        with pytest.raises(RuntimeError):
            check_deployments_file_exists()
# Exemplo n.º 22
def run_tunnel_test(user: str, nodes: Nodes):
    """Runs basic tunnelling checks on the first node, then verifies the
        nodes become unusable after the allocation is cancelled."""
    node = nodes[0]
    nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
    assert nodes.running()
    with ExitStack() as ctx:
        ctx.enter_context(cancel_on_exit(nodes))
        there = get_free_remote_port(node=nodes[0])
        here = get_free_local_port()
        server = start_dummy_server_thread(user=user, server_port=there)
        ctx.enter_context(join_on_exit(server))

        tunnel = node.tunnel(there=there, here=here)
        ctx.enter_context(close_tunnel_on_exit(tunnel))

        print(tunnel)
        assert repr(tunnel) == str(tunnel)

        assert tunnel.here == here
        assert tunnel.there == there

        def access_dummy_server():
            # Retried below, as the server thread may not be up yet.
            return requests.get(
                "http://127.0.0.1:{local_port}".format(local_port=here))

        request = retry(access_dummy_server,
                        retries=5 * get_testing_process_count(),
                        seconds_between_retries=2)
        assert "text/html" in request.headers['Content-type']

        ssh_tunnel = node.tunnel_ssh()
        ctx.enter_context(close_tunnel_on_exit(ssh_tunnel))

        assert repr(ssh_tunnel) == str(ssh_tunnel)
        assert str(ssh_tunnel).startswith("ssh ")
        assert user in str(ssh_tunnel)
        assert str(ssh_tunnel.here) in str(ssh_tunnel)
        assert ssh_tunnel.there == node.port

    # The allocation was cancelled on stack exit.
    assert not nodes.running()
    with pytest.raises(RuntimeError):
        nodes.wait()
    with pytest.raises(RuntimeError):
        node.tunnel(there=there, here=here)
# Exemplo n.º 23
def test_basic():
    """Allocates two nodes, runs commands on them, and checks they become
        unusable after cancellation."""
    user = USER_1
    with ExitStack() as ctx:
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))

        clusters = show_clusters()
        print(clusters)

        assert len(clusters) == 1

        cluster = show_cluster(name=TEST_CLUSTER)
        print(cluster)

        assert clusters[TEST_CLUSTER] == cluster

        nodes = cluster.allocate_nodes(nodes=2,
                                       cores=1,
                                       memory_per_node=MiB(100),
                                       walltime=Walltime(minutes=30),
                                       native_args={'--partition': 'debug'})
        with cancel_on_exit(nodes):
            assert len(nodes) == 2
            assert nodes[0] in nodes
            print(nodes)
            assert repr(nodes) == str(nodes)

            nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
            assert nodes.running()

            print(nodes)
            print(nodes[0])

            for allocated_node in (nodes[0], nodes[1]):
                assert allocated_node.run('whoami') == user

        # The allocation is gone, so the nodes are no longer usable.
        assert not nodes.running()
        with pytest.raises(RuntimeError):
            nodes.wait()
        with pytest.raises(RuntimeError):
            nodes[0].run('whoami')
# Exemplo n.º 24
def test_dask_deployment_with_redeploy_on_validation_failure():
    """Dask deployment should succeed even if one worker initially fails
        validation and has to be redeployed."""
    user = USER_41
    with ExitStack() as ctx:
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        allocation = cluster.allocate_nodes(nodes=2,
                                            cores=1,
                                            memory_per_node=MiB(100),
                                            walltime=Walltime(minutes=30))
        ctx.enter_context(cancel_on_exit(allocation))

        assert idact.detail.dask.deploy_dask_impl.validate_worker
        original_validate_worker = \
            idact.detail.dask.deploy_dask_impl.validate_worker

        fake_validation_counter = [0]

        # pylint: disable=unused-argument
        def fake_validate_worker(worker: DaskWorkerDeployment):
            # Fails only for the very first worker; later validations pass.
            current_count = fake_validation_counter[0]
            fake_validation_counter[0] = current_count + 1

            print("Fake worker validation.")
            if current_count == 0:
                raise RuntimeError("Fake worker validation: First node fail.")
            print("Deciding the worker is valid.")

        try:
            idact.detail.dask.deploy_dask_impl.validate_worker = \
                fake_validate_worker

            with deploy_dask_on_testing_cluster(allocation):
                pass

            # Two initial validations plus one for the redeployed worker.
            assert fake_validation_counter[0] == 3

        finally:
            # Always restore the monkeypatched module attribute.
            idact.detail.dask.deploy_dask_impl.validate_worker = \
                original_validate_worker
# Exemplo n.º 25
def test_dask_deployment_with_setup_actions():
    """Setup actions configured for Dask should run before deployment."""
    user = USER_18
    with ExitStack() as ctx:
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        allocation = cluster.allocate_nodes(nodes=2,
                                            cores=1,
                                            memory_per_node=MiB(100),
                                            walltime=Walltime(minutes=30))
        ctx.enter_context(cancel_on_exit(allocation))

        cluster.config.setup_actions.dask = ['echo ABC > file.txt',
                                             'mv file.txt file2.txt']
        with deploy_dask_on_testing_cluster(allocation) as target:
            # Both setup actions ran, leaving file2.txt behind.
            assert target.run("cat file2.txt") == "ABC"
# Exemplo n.º 26
def test_can_read_node_resources():
    """Resource totals are unknown for the access node but exact for an
        allocated node; usage readings stay in a believable range."""
    user = USER_39
    with ExitStack() as ctx:
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)

        access_node = cluster.get_access_node()
        print(str(access_node))
        assert repr(access_node) == str(access_node)

        # Totals are unknown for the access node.
        assert access_node.resources.cpu_cores is None
        assert access_node.resources.memory_total is None
        start_stress_cpu(user=user, timeout=10)
        try:
            check_resources_in_believable_range(access_node.resources)
        finally:
            stop_stress_cpu(user=user)

        allocation = cluster.allocate_nodes(cores=1,
                                            memory_per_node=bitmath.GiB(0.8))

        assert len(allocation) == 1
        allocated_node = allocation[0]

        ctx.enter_context(cancel_on_exit(allocation))

        allocation.wait(timeout=SLURM_WAIT_TIMEOUT)
        assert allocation.running()

        # Allocated nodes report the exact requested resources.
        assert allocated_node.resources.cpu_cores == 1
        assert allocated_node.resources.memory_total == bitmath.GiB(0.8)
        start_stress_cpu(user=user, timeout=10)
        try:
            check_resources_in_believable_range(access_node.resources)
        finally:
            stop_stress_cpu(user=user)

        assert allocated_node.run('whoami') == user
# Exemplo n.º 27
def test_clear_deployments():
    """Clearing pushed deployments should remove the deployments file and
        make subsequent pulls return nothing."""
    user = USER_46
    with ExitStack() as ctx:
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))
        ctx.enter_context(clear_deployment_sync_data(user))

        cluster = show_cluster(name=TEST_CLUSTER)
        access_node = cluster.get_access_node()

        def check_deployments_file_exists():
            # Fails with RuntimeError when the file is absent.
            access_node.run("cat ~/.idact/.deployments")

        allocation = cluster.allocate_nodes()
        with cancel_on_exit(allocation):
            allocation.wait(timeout=SLURM_WAIT_TIMEOUT)
            assert allocation.running()

            with pytest.raises(RuntimeError):
                check_deployments_file_exists()

            cluster.push_deployment(deployment=allocation)

            check_deployments_file_exists()

            pulled = cluster.pull_deployments()
            assert len(pulled.nodes) == 1
            assert pulled.nodes[0].running()

            # Pulling does not remove the file.
            check_deployments_file_exists()

            cluster.clear_pushed_deployments()

            # Clearing removes the file entirely.
            with pytest.raises(RuntimeError):
                check_deployments_file_exists()

            assert not cluster.pull_deployments().nodes
# Exemplo n.º 28
def test_allocation_should_default_to_port_22_if_port_info_file_is_missing():
    """When the sshd port info file is removed, the allocation should fall
        back to connecting on port 22."""
    user = USER_61
    with ExitStack() as ctx:
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)

        access_node = cluster.get_access_node()
        allocation = cluster.allocate_nodes(memory_per_node=MiB(100))
        ctx.enter_context(cancel_on_exit(allocation))

        # Keep trying to delete the port info file until the rm succeeds.
        retry(lambda: access_node.run("rm ~/.idact/sshd_ports/alloc-*/*"),
              retries=SLURM_WAIT_TIMEOUT,
              seconds_between_retries=1)

        allocation.wait(timeout=SLURM_WAIT_TIMEOUT)

        assert allocation.running()
        assert allocation[0].port == 22
# Exemplo n.º 29
def test_allocate_defaults():
    """Allocation without arguments should give one node with one core
        and 1 GiB of memory."""
    user = USER_22
    with ExitStack() as ctx:
        ctx.enter_context(disable_pytest_stdin())
        ctx.enter_context(set_up_key_location(user))
        ctx.enter_context(reset_environment(user))
        ctx.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)

        allocation = cluster.allocate_nodes()
        ctx.enter_context(cancel_on_exit(allocation))
        assert len(allocation) == 1
        default_node = allocation[0]

        allocation.wait(timeout=SLURM_WAIT_TIMEOUT)
        assert allocation.running()

        assert default_node.resources.cpu_cores == 1
        assert default_node.resources.memory_total == bitmath.GiB(1)
        print(default_node)

        assert default_node.run('whoami') == user
# Exemplo n.º 30
 def cancel(self):
     """Cancels the scheduler and all worker deployments.

         Every cancellation is registered on an ExitStack, so each one
         runs even if an earlier one raises. The stack unwinds in LIFO
         order: workers are cancelled before the scheduler.
     """
     with ExitStack() as stack:
         stack.enter_context(cancel_on_exit(self._scheduler))
         for worker in self._workers:
             stack.enter_context(cancel_on_exit(worker))