def __init__(self, provider_config, cluster_name):
    NodeProvider.__init__(self, provider_config, cluster_name)
    if cluster_name:
        lock_path = get_lock_path(cluster_name)
        state_path = get_state_path(cluster_name)
        self.state = ClusterState(
            lock_path,
            state_path,
            provider_config,
        )
        self.use_coordinator = False
    else:
        # LocalNodeProvider with a coordinator server.
        self.state = OnPremCoordinatorState(
            "/tmp/coordinator.lock",
            "/tmp/coordinator.state",
            provider_config["list_of_node_ips"],
        )
        self.use_coordinator = True
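# Illustration only (not part of the provider source): a minimal sketch of the
# two provider_config shapes the constructor above distinguishes. The cluster
# name and IP addresses below are hypothetical placeholders.
#
# With a cluster name, per-cluster lock/state files are derived via
# get_lock_path()/get_state_path() and managed by ClusterState:
manual_config = {
    "type": "local",
    "head_ip": "192.168.0.10",
    "worker_ips": ["192.168.0.11", "192.168.0.12"],
}
manual_provider = LocalNodeProvider(manual_config, cluster_name="demo-cluster")
assert manual_provider.use_coordinator is False

# Without a cluster name, the provider instead reads "list_of_node_ips" and
# shares the coordinator-server state files under /tmp via
# OnPremCoordinatorState:
coordinator_config = {
    "type": "local",
    "list_of_node_ips": ["192.168.0.10", "192.168.0.11"],
}
coordinator_provider = LocalNodeProvider(coordinator_config, cluster_name=None)
assert coordinator_provider.use_coordinator is True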
def testClusterStateInit(self):
    """Check ClusterState __init__ func generates correct state file.

    Test the general use case and if num_workers increase/decrease.
    """
    # Use a random head_ip so that the state file is regenerated each time
    # this test is run. (Otherwise the test will fail spuriously when run a
    # second time.)
    self._monkeypatch.setenv("RAY_TMPDIR", self._tmpdir)
    # Ensure that a new cluster can start up if RAY_TMPDIR doesn't exist yet.
    assert not os.path.exists(get_ray_temp_dir())
    head_ip = ".".join(str(random.randint(0, 255)) for _ in range(4))
    cluster_config = {
        "cluster_name": "random_name",
        "min_workers": 0,
        "max_workers": 0,
        "provider": {
            "type": "local",
            "head_ip": head_ip,
            "worker_ips": ["0.0.0.0:1"],
            "external_head_ip": "0.0.0.0.3",
        },
    }
    provider_config = cluster_config["provider"]
    node_provider = _get_node_provider(
        provider_config, cluster_config["cluster_name"], use_cache=False
    )
    assert os.path.exists(get_ray_temp_dir())
    assert node_provider.external_ip(head_ip) == "0.0.0.0.3"
    assert isinstance(node_provider, LocalNodeProvider)
    expected_workers = {}
    expected_workers[provider_config["head_ip"]] = {
        "tags": {TAG_RAY_NODE_KIND: NODE_KIND_HEAD},
        "state": "terminated",
        "external_ip": "0.0.0.0.3",
    }
    expected_workers[provider_config["worker_ips"][0]] = {
        "tags": {TAG_RAY_NODE_KIND: NODE_KIND_WORKER},
        "state": "terminated",
    }

    state_save_path = local_config.get_state_path(cluster_config["cluster_name"])
    assert os.path.exists(state_save_path)
    workers = json.loads(open(state_save_path).read())
    assert workers == expected_workers

    # Test removing workers updates the cluster state.
    del expected_workers[provider_config["worker_ips"][0]]
    removed_ip = provider_config["worker_ips"].pop()
    node_provider = _get_node_provider(
        provider_config, cluster_config["cluster_name"], use_cache=False
    )
    workers = json.loads(open(state_save_path).read())
    assert workers == expected_workers

    # Test adding back workers updates the cluster state.
    expected_workers[removed_ip] = {
        "tags": {TAG_RAY_NODE_KIND: NODE_KIND_WORKER},
        "state": "terminated",
    }
    provider_config["worker_ips"].append(removed_ip)
    node_provider = _get_node_provider(
        provider_config, cluster_config["cluster_name"], use_cache=False
    )
    workers = json.loads(open(state_save_path).read())
    assert workers == expected_workers

    # Test record_local_head_state_if_needed.
    head_ip = cluster_config["provider"]["head_ip"]
    cluster_name = cluster_config["cluster_name"]
    node_provider = _get_node_provider(
        provider_config, cluster_config["cluster_name"], use_cache=False
    )
    assert head_ip not in node_provider.non_terminated_nodes({})
    record_local_head_state_if_needed(node_provider)
    assert head_ip in node_provider.non_terminated_nodes({})
    expected_head_tags = {
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
        TAG_RAY_USER_NODE_TYPE: local_config.LOCAL_CLUSTER_NODE_TYPE,
        TAG_RAY_NODE_NAME: "ray-{}-head".format(cluster_name),
        TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
    }
    assert node_provider.node_tags(head_ip) == expected_head_tags
    # Repeat and verify nothing has changed.
    record_local_head_state_if_needed(node_provider)
    assert head_ip in node_provider.non_terminated_nodes({})
    assert node_provider.node_tags(head_ip) == expected_head_tags
def testClusterStateInit(self):
    """Check ClusterState __init__ func generates correct state file.

    Test the general use case and if num_workers increase/decrease.
    """
    cluster_config = {
        "cluster_name": "random_name",
        "min_workers": 0,
        "max_workers": 0,
        "provider": {
            "type": "local",
            "head_ip": "0.0.0.0:2",
            "worker_ips": ["0.0.0.0:1"],
            "external_head_ip": "0.0.0.0.3",
        },
    }
    provider_config = cluster_config["provider"]
    node_provider = _get_node_provider(
        provider_config, cluster_config["cluster_name"], use_cache=False)
    assert node_provider.external_ip("0.0.0.0:2") == "0.0.0.0.3"
    assert isinstance(node_provider, LocalNodeProvider)
    expected_workers = {}
    expected_workers[provider_config["head_ip"]] = {
        "tags": {
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD
        },
        "state": "terminated",
        "external_ip": "0.0.0.0.3",
    }
    expected_workers[provider_config["worker_ips"][0]] = {
        "tags": {
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER
        },
        "state": "terminated",
    }

    state_save_path = local_config.get_state_path(
        cluster_config["cluster_name"])
    assert os.path.exists(state_save_path)
    workers = json.loads(open(state_save_path).read())
    assert workers == expected_workers

    # Test removing workers updates the cluster state.
    del expected_workers[provider_config["worker_ips"][0]]
    removed_ip = provider_config["worker_ips"].pop()
    node_provider = _get_node_provider(
        provider_config, cluster_config["cluster_name"], use_cache=False)
    workers = json.loads(open(state_save_path).read())
    assert workers == expected_workers

    # Test adding back workers updates the cluster state.
    expected_workers[removed_ip] = {
        "tags": {
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER
        },
        "state": "terminated",
    }
    provider_config["worker_ips"].append(removed_ip)
    node_provider = _get_node_provider(
        provider_config, cluster_config["cluster_name"], use_cache=False)
    workers = json.loads(open(state_save_path).read())
    assert workers == expected_workers