    def testGetOrCreateMultiNodeType(self):
        config = MULTI_WORKER_CLUSTER.copy()
        # Commenting out this line causes the test case to fail?!?!
        config["min_workers"] = 0
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        get_or_create_head_node(
            config,
            config_path,
            no_restart=False,
            restart_only=False,
            yes=True,
            override_cluster_name=None,
            _provider=self.provider,
            _runner=runner)
        self.waitForNodes(1)
        runner.assert_has_call("1.2.3.4", "init_cmd")
        runner.assert_has_call("1.2.3.4", "setup_cmd")
        runner.assert_has_call("1.2.3.4", "start_ray_head")
        self.assertEqual(self.provider.mock_nodes[0].node_type, "empty_node")
        self.assertEqual(
            self.provider.mock_nodes[0].node_config.get("FooProperty"), 42)
        self.assertEqual(
            self.provider.mock_nodes[0].node_config.get("TestProp"), 1)
        self.assertEqual(
            self.provider.mock_nodes[0].tags.get(TAG_RAY_USER_NODE_TYPE),
            "empty_node")
    def testRequestBundlesAccountsForHeadNode(self):
        config = MULTI_WORKER_CLUSTER.copy()
        config["head_node_type"] = "p2.8xlarge"
        config["min_workers"] = 0
        config["max_workers"] = 50
        config_path = self.write_config(config)
        self.provider = MockProvider()
        self.provider.create_node({}, {
            TAG_RAY_USER_NODE_TYPE: "p2.8xlarge",
            TAG_RAY_NODE_KIND: "head"
        }, 1)
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(config_path,
                                        LoadMetrics(),
                                        max_failures=0,
                                        process_runner=runner,
                                        update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 1

        # These requests fit on the head node.
        autoscaler.update()
        self.waitForNodes(1)
        autoscaler.request_resources([{"CPU": 1}])
        autoscaler.update()
        self.waitForNodes(1)
        assert len(self.provider.mock_nodes) == 1
        autoscaler.request_resources([{"GPU": 8}])
        autoscaler.update()
        self.waitForNodes(1)

        # This request requires an additional worker node.
        autoscaler.request_resources([{"GPU": 8}] * 2)
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
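
The head-node accounting above reduces to capacity arithmetic: the p2.8xlarge head provides 8 GPUs, so one {"GPU": 8} request fits on it, while two such requests imply one extra worker. A toy standalone sketch of that check (the helper and its simplifications are illustrative, not the autoscaler's real bin-packing logic):

# Toy accounting sketch: estimate how many identical GPU workers are still
# needed once the head node's capacity is taken into account.
def extra_workers_needed(requests, head_gpus=8, worker_gpus=8):
    demanded = sum(r.get("GPU", 0) for r in requests)
    remaining = max(0, demanded - head_gpus)
    # Round up to whole workers.
    return -(-remaining // worker_gpus)

assert extra_workers_needed([{"GPU": 8}]) == 0      # fits on the head node
assert extra_workers_needed([{"GPU": 8}] * 2) == 1  # needs one extra worker
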
    def testScaleUpIgnoreUsed(self):
        config = MULTI_WORKER_CLUSTER.copy()
        # Commenting out this line causes the test case to fail?!?!
        config["min_workers"] = 0
        config["target_utilization_fraction"] = 1.0
        config_path = self.write_config(config)
        self.provider = MockProvider()
        self.provider.create_node({}, {
            TAG_RAY_NODE_KIND: "head",
            TAG_RAY_USER_NODE_TYPE: "p2.xlarge"
        }, 1)
        head_ip = self.provider.non_terminated_node_ips({})[0]
        self.provider.finish_starting_nodes()
        runner = MockProcessRunner()
        lm = LoadMetrics(local_ip=head_ip)
        autoscaler = StandardAutoscaler(config_path,
                                        lm,
                                        max_failures=0,
                                        process_runner=runner,
                                        update_interval_s=0)
        autoscaler.update()
        self.waitForNodes(1)
        lm.update(head_ip, {"CPU": 4, "GPU": 1}, {}, {})
        self.waitForNodes(1)

        lm.update(head_ip, {
            "CPU": 4,
            "GPU": 1
        }, {"GPU": 1}, {},
                  waiting_bundles=[{
                      "GPU": 1
                  }])
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.xlarge"
Example #4
def test_kubernetes_command_runner():
    fail_cmd = "fail command"
    process_runner = MockProcessRunner([fail_cmd])
    provider = MockProvider()
    provider.create_node({}, {}, 1)
    args = {
        "log_prefix": "prefix",
        "namespace": "namespace",
        "node_id": 0,
        "auth_config": auth_config,
        "process_runner": process_runner,
    }
    cmd_runner = KubernetesCommandRunner(**args)

    env_vars = {"var1": "quote between this \" and this", "var2": "123"}
    cmd_runner.run("echo helloo", environment_variables=env_vars)

    expected = [
        "kubectl",
        "-n",
        "namespace",
        "exec",
        "-it",
        "0",
        "--",
        "bash",
        "--login",
        "-c",
        "-i",
        """\'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export var1=\'"\'"\'"quote between this \\" and this"\'"\'"\';export var2=\'"\'"\'"123"\'"\'"\';echo helloo)\'"""  # noqa: E501
    ]

    assert process_runner.calls[0] == " ".join(expected)
    def testScaleUpLoadMetrics(self):
        config = MULTI_WORKER_CLUSTER.copy()
        config["min_workers"] = 0
        config["max_workers"] = 50
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        lm = LoadMetrics()
        autoscaler = StandardAutoscaler(
            config_path,
            lm,
            max_failures=0,
            process_runner=runner,
            update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(0)
        autoscaler.update()
        lm.update(
            "1.2.3.4", {},
            True, {},
            True, {},
            waiting_bundles=[{"GPU": 1}],
            infeasible_bundles=[{"CPU": 16}])
        autoscaler.update()
        self.waitForNodes(2)
        nodes = {
            self.provider.mock_nodes[0].node_type,
            self.provider.mock_nodes[1].node_type
        }
        assert nodes == {"p2.xlarge", "m4.4xlarge"}
Example #6
def test_kubernetes_command_runner():
    process_runner = MockProcessRunner()
    provider = MockProvider()
    provider.create_node({}, {}, 1)
    args = {
        "log_prefix": "prefix",
        "namespace": "namespace",
        "node_id": 0,
        "auth_config": auth_config,
        "process_runner": process_runner,
    }
    cmd_runner = KubernetesCommandRunner(**args)

    env_vars = {"var1": "quote between this \" and this", "var2": "123"}
    cmd_runner.run("echo helloo", environment_variables=env_vars)

    expected = [
        "kubectl",
        "-n",
        "namespace",
        "exec",
        "-it",
        "0",
        "--",
        "bash",
        "--login",
        "-c",
        "-i",
        """\'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export var1=\'"\'"\'"quote between this \\" and this"\'"\'"\';export var2=\'"\'"\'"123"\'"\'"\';echo helloo)\'"""  # noqa: E501
    ]

    # Much easier to debug this loop than the function call.
    for x, y in zip(process_runner.calls[0], expected):
        assert x == y
    process_runner.assert_has_call("1.2.3.4", exact=expected)
Example #7
    def testRequestBundles(self):
        config = MULTI_WORKER_CLUSTER.copy()
        config["min_workers"] = 0
        config["max_workers"] = 50
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path,
            LoadMetrics(),
            max_failures=0,
            process_runner=runner,
            update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(0)
        autoscaler.request_resources([{"CPU": 1}])
        autoscaler.update()
        self.waitForNodes(1)
        assert self.provider.mock_nodes[0].node_type == "m4.large"
        autoscaler.request_resources([{"GPU": 8}])
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
        autoscaler.request_resources([{"CPU": 32}] * 4)
        autoscaler.update()
        self.waitForNodes(4)
        assert self.provider.mock_nodes[2].node_type == "m4.16xlarge"
        assert self.provider.mock_nodes[3].node_type == "m4.16xlarge"
Example #8
def test_rsync_without_exclude_and_filter():
    process_runner = MockProcessRunner()
    provider = MockProvider()
    provider.create_node({}, {}, 1)
    cluster_name = "cluster"
    args = {
        "log_prefix": "prefix",
        "node_id": 0,
        "provider": provider,
        "auth_config": auth_config,
        "cluster_name": cluster_name,
        "process_runner": process_runner,
        "use_internal_ip": False,
    }
    cmd_runner = SSHCommandRunner(**args)

    local_mount = "/home/ubuntu/base/mount/"
    remote_mount = "/root/protected_mount/"

    process_runner.respond_to_call("docker inspect -f", ["true"])
    cmd_runner.run_rsync_up(local_mount,
                            remote_mount,
                            options={
                                "docker_mount_if_possible": True,
                            })

    process_runner.assert_not_has_call("1.2.3.4", pattern="--exclude test")
    process_runner.assert_not_has_call("1.2.3.4",
                                       pattern="--filter dir-merge,- .ignore")
Example #9
def test_docker_shm_override(run_option_type):
    process_runner = MockProcessRunner()
    provider = MockProvider()
    provider.create_node({}, {}, 1)
    cluster_name = "cluster"

    docker_config = {
        "container_name": "container",
        "image": "rayproject/ray:latest",
        run_option_type: ["--shm-size=80g"]
    }
    args = {
        "log_prefix": "prefix",
        "node_id": 0,
        "provider": provider,
        "auth_config": auth_config,
        "cluster_name": cluster_name,
        "process_runner": process_runner,
        "use_internal_ip": False,
        "docker_config": docker_config,
    }
    cmd_runner = DockerCommandRunner(**args)

    process_runner.respond_to_call("json .Config.Env", 2 * ["[]"])
    cmd_runner.run_init(as_head=True, file_mounts={}, sync_run_yet=True)

    # Ensure the user-provided SHM size is used.
    process_runner.assert_has_call("1.2.3.4", pattern="--shm-size=80g")

    # Ensure that SHM auto detection is bypassed
    process_runner.assert_not_has_call("1.2.3.4", pattern="/proc/meminfo")
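
The behavior being pinned down is: when the user supplies an explicit --shm-size in their run options, that value is passed through verbatim and the usual /proc/meminfo-based auto-detection is skipped. A rough sketch of that decision, with made-up helper names that do not match Ray's internals:

# Hypothetical sketch of the shm-size decision this test exercises.
def detect_shm_size():
    # Stand-in for deriving a value from /proc/meminfo on the target node.
    return "2g"

def choose_run_options(run_options):
    # If the user already supplied --shm-size, use their options untouched
    # and skip auto-detection entirely.
    if any(opt.startswith("--shm-size") for opt in run_options):
        return list(run_options)
    return list(run_options) + ["--shm-size=" + detect_shm_size()]

assert "--shm-size=80g" in choose_run_options(["--shm-size=80g"])
assert "--shm-size=2g" in choose_run_options([])
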
    def testScaleUpMinWorkers(self):
        config = copy.deepcopy(MULTI_WORKER_CLUSTER)
        config["min_workers"] = 2
        config["max_workers"] = 50
        config["idle_timeout_minutes"] = 1
        # Since config["min_workers"] > 1, the remaining worker is started
        # with the default worker node type.
        config["available_node_types"]["p2.8xlarge"]["min_workers"] = 1
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        lm = LoadMetrics()
        autoscaler = StandardAutoscaler(
            config_path,
            lm,
            max_failures=0,
            process_runner=runner,
            update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(2)
        assert len(self.provider.mock_nodes) == 2
        assert {
            self.provider.mock_nodes[0].node_type,
            self.provider.mock_nodes[1].node_type
        } == {"p2.8xlarge", "m4.large"}
        self.provider.create_node({}, {
            TAG_RAY_USER_NODE_TYPE: "p2.8xlarge",
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER
        }, 2)
        self.provider.create_node({}, {
            TAG_RAY_USER_NODE_TYPE: "m4.16xlarge",
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER
        }, 2)
        assert len(self.provider.non_terminated_nodes({})) == 6
        # Make sure that after idle_timeout_minutes we don't kill idle
        # min workers.
        for node_id in self.provider.non_terminated_nodes({}):
            lm.last_used_time_by_ip[self.provider.internal_ip(node_id)] = -60
        autoscaler.update()
        self.waitForNodes(2)

        cnt = 0
        for id in self.provider.mock_nodes:
            if self.provider.mock_nodes[id].state == "running" or \
                    self.provider.mock_nodes[id].state == "pending":
                assert self.provider.mock_nodes[id].node_type in {
                    "p2.8xlarge", "m4.large"
                }
                cnt += 1
        assert cnt == 2
Example #11
    def testScaleUpMinSanity(self):
        config_path = self.write_config(MULTI_WORKER_CLUSTER)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path,
            LoadMetrics(),
            max_failures=0,
            process_runner=runner,
            update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(2)
        autoscaler.update()
        self.waitForNodes(2)
Example #12
    def testManyActors(self):
        config = copy.deepcopy(SAMPLE_CLUSTER_CONFIG)
        config_path = self.write_config(config)
        self.provider = MockProvider()
        simulator = Simulator(config_path, self.provider)

        start_count = 0

        def start_callback():
            nonlocal start_count
            start_count += 1

        tasks = [
            Actor(
                duration=float("inf"),
                resources={"CPU": 1},
                start_callback=start_callback,
            ) for _ in range(5000)
        ]
        simulator.submit(tasks)

        time = 0
        while start_count < len(tasks):
            time = simulator.step()

        assert time < 650

        # Check event logs contain add/remove node events.
        assert any("Adding" in x
                   for x in simulator.autoscaler.event_summarizer.summary())
        assert any("Removing" in x
                   for x in simulator.autoscaler.event_summarizer.summary())
Example #13
    def testManyTasks(self):
        config = copy.deepcopy(SAMPLE_CLUSTER_CONFIG)
        config_path = self.write_config(config)
        self.provider = MockProvider()
        simulator = Simulator(config_path, self.provider)

        done_count = 0

        def done_callback():
            nonlocal done_count
            done_count += 1

        tasks = [
            Task(duration=200,
                 resources={"CPU": 1},
                 done_callback=done_callback) for _ in range(5000)
        ]
        simulator.submit(tasks)

        time = 0
        while done_count < len(tasks):
            time = simulator.step()

        assert time < 850
        # TODO (Alex): Not clear what's actually worth asserting here.
        assert simulator.node_costs()

        # Check event logs contain add/remove node events.
        assert any("Adding" in x
                   for x in simulator.autoscaler.event_summarizer.summary())
        assert any("Removing" in x
                   for x in simulator.autoscaler.event_summarizer.summary())
Example #14
    def testManyActors(self):
        # cli_logger.configure(log_style="record", verbosity=-1)
        config = copy.deepcopy(SAMPLE_CLUSTER_CONFIG)
        config_path = self.write_config(config)
        self.provider = MockProvider()
        simulator = Simulator(config_path, self.provider)

        start_count = 0

        def start_callback():
            nonlocal start_count
            start_count += 1

        tasks = [
            Actor(
                duration=float("inf"),
                resources={"CPU": 1},
                start_callback=start_callback,
            ) for _ in range(5000)
        ]
        simulator.submit(tasks)

        time = 0
        while start_count < len(tasks):
            time = simulator.step()

        assert time < 200
Example #15
    def testManyTasks(self):
        cli_logger.configure(log_style="record", verbosity=-1)
        config = copy.deepcopy(SAMPLE_CLUSTER_CONFIG)
        config_path = self.write_config(config)
        self.provider = MockProvider()
        simulator = Simulator(config_path, self.provider)

        done_count = 0

        def done_callback():
            nonlocal done_count
            done_count += 1

        tasks = [
            Task(duration=200,
                 resources={"CPU": 1},
                 done_callback=done_callback) for _ in range(5000)
        ]
        simulator.submit(tasks)

        time = 0
        while done_count < len(tasks):
            time = simulator.step()

        assert time < 400
    def testManyTasks(self):
        config = copy.deepcopy(SAMPLE_CLUSTER_CONFIG)
        config_path = self.write_config(config)
        self.provider = MockProvider()
        simulator = Simulator(config_path, self.provider)

        done_count = 0

        def done_callback():
            nonlocal done_count
            done_count += 1

        tasks = [
            Task(
                duration=200,
                resources={"CPU": 1},
                done_callback=done_callback) for _ in range(5000)
        ]
        simulator.submit(tasks)

        time = 0
        while done_count < len(tasks):
            time = simulator.step()

        assert time < 850
        # TODO (Alex): Not clear what's actually worth asserting here.
        assert simulator.node_costs()
    def testCommandPassing(self):
        t = "custom"
        config = MULTI_WORKER_CLUSTER.copy()
        config["available_node_types"]["p2.8xlarge"][
            "worker_setup_commands"] = ["new_worker_setup_command"]
        config["available_node_types"]["p2.xlarge"][
            "initialization_commands"] = ["new_worker_initialization_cmd"]
        config["available_node_types"]["p2.xlarge"]["resources"][t] = 1
        # Commenting out this line causes the test case to fail?!?!
        config["min_workers"] = 0
        config["max_workers"] = 10
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path,
            LoadMetrics(),
            max_failures=0,
            process_runner=runner,
            update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(0)
        autoscaler.request_resources([{"CPU": 1}])
        autoscaler.update()
        self.waitForNodes(1)
        assert self.provider.mock_nodes[0].node_type == "m4.large"
        autoscaler.request_resources([{"GPU": 8}])
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
        autoscaler.request_resources([{"GPU": 1}] * 9)
        autoscaler.update()
        self.waitForNodes(3)
        assert self.provider.mock_nodes[2].node_type == "p2.xlarge"
        autoscaler.update()
        sleep(0.1)
        runner.assert_has_call(self.provider.mock_nodes[1].internal_ip,
                               "new_worker_setup_command")
        runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                                   "setup_cmd")
        runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                                   "worker_setup_cmd")
        runner.assert_has_call(self.provider.mock_nodes[2].internal_ip,
                               "new_worker_initialization_cmd")
        runner.assert_not_has_call(self.provider.mock_nodes[2].internal_ip,
                                   "init_cmd")
Example #18
def test_docker_command_runner():
    process_runner = MockProcessRunner()
    provider = MockProvider()
    provider.create_node({}, {}, 1)
    cluster_name = "cluster"
    ssh_control_hash = hashlib.md5(cluster_name.encode()).hexdigest()
    ssh_user_hash = hashlib.md5(getuser().encode()).hexdigest()
    ssh_control_path = "/tmp/ray_ssh_{}/{}".format(ssh_user_hash[:10],
                                                   ssh_control_hash[:10])
    docker_config = {"container_name": "container"}
    args = {
        "log_prefix": "prefix",
        "node_id": 0,
        "provider": provider,
        "auth_config": auth_config,
        "cluster_name": cluster_name,
        "process_runner": process_runner,
        "use_internal_ip": False,
        "docker_config": docker_config,
    }
    cmd_runner = DockerCommandRunner(**args)
    assert len(process_runner.calls) == 0, "No calls should be made in ctor"

    env_vars = {"var1": "quote between this \" and this", "var2": "123"}
    cmd_runner.run("echo hello", environment_variables=env_vars)

    # This string is insane because there are an absurd number of embedded
    # quotes. While this is a ridiculous string, the escape behavior is
    # important and somewhat difficult to get right for environment variables.
    cmd = """'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (docker exec -it  container /bin/bash -c '"'"'bash --login -c -i '"'"'"'"'"'"'"'"'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export var1='"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"quote between this \\" and this"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"';export var2='"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"123"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"'"';echo hello)'"'"'"'"'"'"'"'"''"'"' )'"""  # noqa: E501

    expected = [
        "ssh", "-tt", "-i", "8265.pem", "-o", "StrictHostKeyChecking=no", "-o",
        "UserKnownHostsFile=/dev/null", "-o", "IdentitiesOnly=yes", "-o",
        "ExitOnForwardFailure=yes", "-o", "ServerAliveInterval=5", "-o",
        "ServerAliveCountMax=3", "-o", "ControlMaster=auto", "-o",
        "ControlPath={}/%C".format(ssh_control_path), "-o",
        "ControlPersist=10s", "-o", "ConnectTimeout=120s", "[email protected]",
        "bash", "--login", "-c", "-i", cmd
    ]
    # Much easier to debug this loop than the function call.
    for x, y in zip(process_runner.calls[0], expected):
        print(f"expeted:\t{y}")
        print(f"actual: \t{x}")
        assert x == y
    process_runner.assert_has_call("1.2.3.4", exact=expected)
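
The deeply nested quoting in cmd comes from wrapping an already-quoted command inside another bash -c layer for docker exec: every layer re-quotes the single quotes of the layer below. A small sketch of one way to build such an env-export prefix with shlex.quote; this illustrates the quoting problem only and is not the exact string Ray produces:

import shlex

# Build `export var=...;` prefixes for a command that will be run via
# `bash -c`, quoting values so embedded quotes survive one shell layer.
def with_env_vars(command, env_vars):
    exports = ";".join(
        "export {}={}".format(name, shlex.quote(value))
        for name, value in env_vars.items())
    return "{};{}".format(exports, command)

inner = with_env_vars("echo hello", {"var1": 'quote between this " and this'})
# Each additional `bash -c` / `docker exec` layer needs another round of
# quoting, which is what makes the expected string above so hairy.
outer = "bash --login -c -i {}".format(shlex.quote(inner))
print(outer)
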
def test_get_nodes_to_launch_max_launch_concurrency():
    provider = MockProvider()
    new_types = copy.deepcopy(TYPES_A)
    new_types["p2.8xlarge"]["min_workers"] = 4
    new_types["p2.8xlarge"]["max_workers"] = 40

    scheduler = ResourceDemandScheduler(provider, new_types, 30)

    to_launch = scheduler.get_nodes_to_launch([], {}, [], {}, [])
    # Respects min_workers despite concurrency limitation.
    assert to_launch == {"p2.8xlarge": 4}

    provider.create_node({}, {
        TAG_RAY_USER_NODE_TYPE: "p2.8xlarge",
        TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED
    }, 1)
    nodes = provider.non_terminated_nodes({})
    # Trying to force here that the node shows in nodes but not connected yet
    # and hence does not show up in LoadMetrics (or utilizations).
    ips = provider.non_terminated_node_ips(
        {TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE})
    utilizations = {ip: {"GPU": 8} for ip in ips}
    launching_nodes = {"p2.8xlarge": 1}
    # requires 41 p2.8xls (currently 1 pending, 1 launching, 0 running)
    demands = [{"GPU": 8}] * (len(utilizations) + 40)
    to_launch = scheduler.get_nodes_to_launch(nodes, launching_nodes, demands,
                                              utilizations, [])
    # Enforces max launch to 5 when < 5 running. 2 are pending/launching.
    assert to_launch == {"p2.8xlarge": 3}

    provider.create_node({}, {
        TAG_RAY_USER_NODE_TYPE: "p2.8xlarge",
        TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE
    }, 8)
    nodes = provider.non_terminated_nodes({})
    ips = provider.non_terminated_node_ips(
        {TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE})
    utilizations = {ip: {"GPU": 8} for ip in ips}
    launching_nodes = {"p2.8xlarge": 1}
    # Requires an additional 17 p2.8xls (now 1 pending, 1 launching, 8 running)
    demands = [{"GPU": 8}] * (len(utilizations) + 15)
    to_launch = scheduler.get_nodes_to_launch(nodes, launching_nodes, demands,
                                              utilizations, [])
    # We are allowed to launch up to 8 more since 8 are running.
    # We already have 2 pending/launching, so only 6 remain.
    assert to_launch == {"p2.8xlarge": 6}
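
The two concurrency assertions follow the rule described in the comments: at most max(5, number of running nodes of that type) launches may be in flight, and nodes already pending or launching count against that budget. A quick arithmetic sketch of both cases (the cap formula is paraphrased from the comments, not lifted from the scheduler source):

# Paraphrase of the launch-concurrency math checked above.
def allowed_new_launches(running, pending, launching, wanted, cap_floor=5):
    cap = max(cap_floor, running)
    budget = max(0, cap - pending - launching)
    return min(wanted, budget)

# 0 running, 1 pending, 1 launching, 41 nodes' worth of demand -> 3 launches.
assert allowed_new_launches(running=0, pending=1, launching=1, wanted=41) == 3
# 8 running, 1 pending, 1 launching, 17 more wanted -> 6 launches.
assert allowed_new_launches(running=8, pending=1, launching=1, wanted=17) == 6
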
Example #20
def test_kubernetes_command_runner():
    fail_cmd = "fail command"
    process_runner = MockProcessRunner([fail_cmd])
    provider = MockProvider()
    provider.create_node({}, {}, 1)
    args = {
        "log_prefix": "prefix",
        "namespace": "namespace",
        "node_id": 0,
        "auth_config": auth_config,
        "process_runner": process_runner,
    }
    cmd_runner = KubernetesCommandRunner(**args)

    env_vars = {"var1": "quote between this \" and this", "var2": "123"}
    cmd_runner.run("echo helloo", environment_variables=env_vars)

    expected = [
        "kubectl",
        "-n",
        "namespace",
        "exec",
        "-it",
        "0",
        "--",
        "bash",
        "--login",
        "-c",
        "-i",
        """\'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export var1=\'"\'"\'"quote between this \\" and this"\'"\'"\';export var2=\'"\'"\'"123"\'"\'"\';echo helloo)\'"""  # noqa: E501
    ]

    assert process_runner.calls[0] == " ".join(expected)

    logger = logging.getLogger("ray.autoscaler._private.command_runner")
    with pytest.raises(SystemExit) as pytest_wrapped_e, patch.object(
            logger, "error") as mock_logger_error:
        cmd_runner.run(fail_cmd, exit_on_fail=True)

    failed_cmd_expected = f'prefixCommand failed: \n\n  kubectl -n namespace exec -it 0 --\'bash --login -c -i \'"\'"\'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && ({fail_cmd})\'"\'"\'\'\n'  # noqa: E501
    mock_logger_error.assert_called_once_with(failed_cmd_expected)
    assert pytest_wrapped_e.type == SystemExit
    assert pytest_wrapped_e.value.code == 1
    def testUpdateConfig(self):
        config = MULTI_WORKER_CLUSTER.copy()
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path,
            LoadMetrics(),
            max_failures=0,
            process_runner=runner,
            update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(2)
        config["min_workers"] = 0
        config["available_node_types"]["m4.large"]["node_config"][
            "field_changed"] = 1
        config_path = self.write_config(config)
        autoscaler.update()
        self.waitForNodes(0)
    def test_packing(self):
        provider = MockProvider()
        scheduler = ResourceDemandScheduler(provider, TYPES_A, 10)

        provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 1)
        # At this point our cluster has 1 p2.8xlarge instance (8 GPUs) and is
        # fully idle.
        nodes = provider.non_terminated_nodes({})

        resource_demands = [{"GPU": 1}] * 2
        pending_placement_groups = [
            PlacementGroupTableData(
                state=PlacementGroupTableData.PENDING,
                strategy=PlacementStrategy.STRICT_PACK,
                bundles=[Bundle(unit_resources={"GPU": 2})] * 3),
        ]
        # The 2 resource demand gpus should still be packed onto the same node
        # as the 6 GPU placement group.
        to_launch = scheduler.get_nodes_to_launch(nodes, {}, resource_demands,
                                                  {}, pending_placement_groups)
        assert to_launch == {}
    def test_strategies(self):
        provider = MockProvider()
        scheduler = ResourceDemandScheduler(provider, TYPES_A, 10)

        provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 2)
        # At this point our cluster has 2 p2.8xlarge instances (16 GPUs) and is
        # fully idle.
        nodes = provider.non_terminated_nodes({})

        resource_demands = [{"GPU": 4}] * 2
        pending_placement_groups = [
            # Requires a new node (only uses 2 GPUs on it though).
            PlacementGroupTableData(state=PlacementGroupTableData.PENDING,
                                    strategy=PlacementStrategy.STRICT_SPREAD,
                                    bundles=[
                                        Bundle(unit_resources={"GPU": 2}),
                                        Bundle(unit_resources={"GPU": 2}),
                                        Bundle(unit_resources={"GPU": 2})
                                    ]),
            # Requires a new node (uses the whole node).
            PlacementGroupTableData(
                state=PlacementGroupTableData.PENDING,
                strategy=PlacementStrategy.STRICT_PACK,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 4)),
            # Fits across the machines that the strict spread runs on.
            PlacementGroupTableData(
                state=PlacementGroupTableData.PENDING,
                strategy=PlacementStrategy.PACK,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
            # Fits across the machines that the strict spread runs on.
            PlacementGroupTableData(
                state=PlacementGroupTableData.PENDING,
                strategy=PlacementStrategy.SPREAD,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
        ]
        to_launch = scheduler.get_nodes_to_launch(nodes, {}, resource_demands,
                                                  {}, pending_placement_groups)
        assert to_launch == {"p2.8xlarge": 2}
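
The expected {"p2.8xlarge": 2} follows from GPU arithmetic over the pending work: STRICT_SPREAD needs its three 2-GPU bundles on three distinct nodes, STRICT_PACK needs 8 GPUs on a single node, and the PACK/SPREAD groups plus the two 4-GPU demands fill what remains. A back-of-the-envelope tally, under the simplifying assumption that only total GPU count matters (the real scheduler also checks the per-node placement constraints, which two new nodes happen to satisfy here):

# Rough GPU tally for the scenario above (each p2.8xlarge has 8 GPUs).
existing_gpus = 2 * 8                      # two idle p2.8xlarge
demand_gpus = (
    2 * 4 +      # two {"GPU": 4} resource demands
    3 * 2 +      # STRICT_SPREAD: three 2-GPU bundles on distinct nodes
    4 * 2 +      # STRICT_PACK: 8 GPUs that must land on one node
    2 * 2 +      # PACK: two 2-GPU bundles
    2 * 2)       # SPREAD: two 2-GPU bundles
extra_gpus = demand_gpus - existing_gpus   # 30 - 16 = 14 GPUs short
assert -(-extra_gpus // 8) == 2            # two more 8-GPU nodes
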
def test_get_nodes_to_launch_with_min_workers_and_bin_packing():
    provider = MockProvider()
    new_types = copy.deepcopy(TYPES_A)
    new_types["p2.8xlarge"]["min_workers"] = 2
    scheduler = ResourceDemandScheduler(provider, new_types, 10)

    provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 1)

    nodes = provider.non_terminated_nodes({})

    ips = provider.non_terminated_node_ips({})
    # 1 free p2.8xls
    utilizations = {ip: {"GPU": 8} for ip in ips}
    # 1 more on the way
    pending_nodes = {"p2.8xlarge": 1}
    # requires 2 p2.8xls (only 2 are in cluster/pending) and 1 p2.xlarge
    demands = [{"GPU": 8}] * (len(utilizations) + 1) + [{"GPU": 1}]
    to_launch = scheduler.get_nodes_to_launch(nodes, pending_nodes, demands,
                                              utilizations, [])
    assert to_launch == {"p2.xlarge": 1}

    # 3 min_workers of p2.8xlarge covers the 2 p2.8xlarge + 1 p2.xlarge demand.
    # 2 p2.8xlarge are running/pending. So we need 1 more p2.8xlarge only to
    # meet the min_workers constraint and the demand.
    new_types["p2.8xlarge"]["min_workers"] = 3
    scheduler = ResourceDemandScheduler(provider, new_types, 10)
    to_launch = scheduler.get_nodes_to_launch(nodes, pending_nodes, demands,
                                              utilizations, [])
    # Make sure it does not return [("p2.8xlarge", 1), ("p2.xlarge", 1)]
    assert to_launch == {"p2.8xlarge": 1}
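
The second half of this test is about not double-counting: with min_workers = 3, the single extra p2.8xlarge launched for the constraint also absorbs the leftover {"GPU": 1} demand, so no p2.xlarge should be requested on top of it. A small tally of that reasoning (plain arithmetic, not the scheduler's algorithm):

# One p2.8xlarge is running and one is pending; min_workers of that type is 3.
running_or_pending = 2
to_launch_for_min = 3 - running_or_pending             # one more p2.8xlarge

# Demand: two full {"GPU": 8} bundles plus one stray {"GPU": 1}.
demand_gpus = 2 * 8 + 1                                 # 17
supplied_gpus = (running_or_pending + to_launch_for_min) * 8   # 24
# The node added for the min_workers constraint already covers the stray
# {"GPU": 1}, so nothing else needs to be launched.
assert supplied_gpus >= demand_gpus
assert to_launch_for_min == 1
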
    def testResourcePassing(self):
        config = MULTI_WORKER_CLUSTER.copy()
        config["min_workers"] = 0
        config["max_workers"] = 50
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path,
            LoadMetrics(),
            max_failures=0,
            process_runner=runner,
            update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(0)
        autoscaler.request_resources([{"CPU": 1}])
        autoscaler.update()
        self.waitForNodes(1)
        assert self.provider.mock_nodes[0].node_type == "m4.large"
        autoscaler.request_resources([{"GPU": 8}])
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"

        # TODO (Alex): Autoscaler creates the node during one update then
        # starts the updater in the next update. The sleep is largely
        # unavoidable because the updater runs in its own thread and we have no
        # good way of ensuring that the commands are sent in time.
        autoscaler.update()
        sleep(0.1)

        # These checks are done separately because we have no guarantees on the
        # order the dict is serialized in.
        runner.assert_has_call("172.0.0.0", "RAY_OVERRIDE_RESOURCES=")
        runner.assert_has_call("172.0.0.0", "\"CPU\":2")
        runner.assert_has_call("172.0.0.1", "RAY_OVERRIDE_RESOURCES=")
        runner.assert_has_call("172.0.0.1", "\"CPU\":32")
        runner.assert_has_call("172.0.0.1", "\"GPU\":8")
    def test_many_strict_spreads(self):
        provider = MockProvider()
        scheduler = ResourceDemandScheduler(provider, TYPES_A, 10)

        provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 2)
        # At this point our cluster has 2 p2.8xlarge instances (16 GPUs) and is
        # fully idle.
        nodes = provider.non_terminated_nodes({})

        resource_demands = [{"GPU": 1}] * 6
        pending_placement_groups = [
            # Requires a new node (only uses 2 GPUs on it though).
            PlacementGroupTableData(
                state=PlacementGroupTableData.PENDING,
                strategy=PlacementStrategy.STRICT_SPREAD,
                bundles=[Bundle(unit_resources={"GPU": 2})] * 3),
        ]
        # Each placement group will take up 2 GPUs per node, but the distinct
        # placement groups should still reuse the same nodes.
        pending_placement_groups = pending_placement_groups * 3
        to_launch = scheduler.get_nodes_to_launch(nodes, {}, resource_demands,
                                                  {}, pending_placement_groups)
        assert to_launch == {"p2.8xlarge": 1}
Example #27
    def testGetOrCreateMultiNodeType(self):
        config_path = self.write_config(MULTI_WORKER_CLUSTER)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        get_or_create_head_node(
            MULTI_WORKER_CLUSTER,
            config_path,
            no_restart=False,
            restart_only=False,
            yes=True,
            override_cluster_name=None,
            _provider=self.provider,
            _runner=runner)
        self.waitForNodes(1)
        runner.assert_has_call("1.2.3.4", "init_cmd")
        runner.assert_has_call("1.2.3.4", "head_setup_cmd")
        runner.assert_has_call("1.2.3.4", "start_ray_head")
        self.assertEqual(self.provider.mock_nodes[0].node_type, "m4.large")
        self.assertEqual(
            self.provider.mock_nodes[0].node_config.get("FooProperty"), 42)
        self.assertEqual(
            self.provider.mock_nodes[0].node_config.get("TestProp"), 1)
        self.assertEqual(
            self.provider.mock_nodes[0].tags.get(TAG_RAY_USER_NODE_TYPE),
            "m4.large")
def test_get_nodes_to_launch_limits():
    provider = MockProvider()
    scheduler = ResourceDemandScheduler(provider, TYPES_A, 3)

    provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 2)

    nodes = provider.non_terminated_nodes({})

    ips = provider.non_terminated_node_ips({})
    utilizations = {ip: {"GPU": 8} for ip in ips}

    to_launch = scheduler.get_nodes_to_launch(nodes, {"p2.8xlarge": 1}, [{
        "GPU": 8
    }] * 2, utilizations)
    assert to_launch == {}
def test_get_nodes_to_launch_with_min_workers():
    provider = MockProvider()
    new_types = copy.deepcopy(TYPES_A)
    new_types["p2.8xlarge"]["min_workers"] = 2
    scheduler = ResourceDemandScheduler(provider, new_types, 3)

    provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 1)

    nodes = provider.non_terminated_nodes({})

    ips = provider.non_terminated_node_ips({})
    utilizations = {ip: {"GPU": 8} for ip in ips}

    to_launch = scheduler.get_nodes_to_launch(nodes, {}, [{
        "GPU": 8
    }], utilizations, [])
    assert to_launch == {"p2.8xlarge": 1}
def test_calculate_node_resources():
    provider = MockProvider()
    scheduler = ResourceDemandScheduler(provider, TYPES_A, 10)

    provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 2)

    nodes = provider.non_terminated_nodes({})

    ips = provider.non_terminated_node_ips({})
    # 2 free p2.8xls
    utilizations = {ip: {"GPU": 8} for ip in ips}
    # 1 more on the way
    pending_nodes = {"p2.8xlarge": 1}
    # requires 4 p2.8xls (only 3 are in cluster/pending)
    demands = [{"GPU": 8}] * (len(utilizations) + 2)
    to_launch = scheduler.get_nodes_to_launch(nodes, pending_nodes, demands,
                                              utilizations, [])

    assert to_launch == {"p2.8xlarge": 1}