def create_node(self, node_config, tags, count):
    """Bring up ``count`` nodes, restarting cached stopped nodes first.

    When ``self.cache_stopped_nodes`` is truthy, stopped nodes whose
    cluster-name, node-kind, launch-config and user-node-type tags match
    the corresponding entries of ``tags`` are started one at a time
    (waiting for each start to finish) and re-tagged with ``tags``.
    Whatever part of ``count`` is still outstanding afterwards is
    delegated to ``self._create_node``.

    Args:
        node_config: Node configuration, forwarded to ``_create_node``.
        tags: Tag dict applied to reused nodes and forwarded to
            ``_create_node`` for new ones.
        count: Number of nodes requested.
    """
    resource_group = self.provider_config["resource_group"]

    if self.cache_stopped_nodes:
        # Only these tags decide whether a stopped node may be reused.
        validity_tags = (
            TAG_RAY_CLUSTER_NAME,
            TAG_RAY_NODE_KIND,
            TAG_RAY_LAUNCH_CONFIG,
            TAG_RAY_USER_NODE_TYPE,
        )
        filters = {tag: tags[tag] for tag in validity_tags if tag in tags}
        reuse_nodes = self.stopped_nodes(filters)[:count]

        # Skip the whole reuse path when nothing matched, so the log does
        # not claim "Reusing nodes []" and no SDK lookup is done for
        # nothing.
        if reuse_nodes:
            logger.info(
                f"Reusing nodes {list(reuse_nodes)}. "
                "To disable reuse, set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration.",
            )
            start = get_azure_sdk_function(
                client=self.compute_client.virtual_machines,
                function_name="start",
            )
            for node_id in reuse_nodes:
                # Wait for the start to finish before re-tagging the node.
                start(
                    resource_group_name=resource_group, vm_name=node_id
                ).wait()
                self.set_node_tags(node_id, tags)
            count -= len(reuse_nodes)

    if count:
        self._create_node(node_config, tags, count)
def set_node_tags(self, node_id, tags):
    """Sets the tag values (string dict) for the specified node.

    ``tags`` is merged over the tags currently recorded for the node, the
    merged dict is submitted through the virtual-machines ``update`` SDK
    function, and it is then stored under
    ``self.cached_nodes[node_id]["tags"]``.

    Args:
        node_id: Name of the VM whose tags are updated.
        tags: Tag names mapped to the values to set; existing tags not
            named here are kept.
    """
    # Merge into a copy: the dict handed back by _get_cached_node must not
    # be mutated before the update call has gone through, otherwise a
    # failing call would leave local state claiming tags that were never
    # applied.
    node_tags = dict(self._get_cached_node(node_id)["tags"])
    node_tags.update(tags)

    update = get_azure_sdk_function(
        client=self.compute_client.virtual_machines,
        function_name="update",
    )
    update(
        resource_group_name=self.provider_config["resource_group"],
        vm_name=node_id,
        parameters={"tags": node_tags},
    )
    self.cached_nodes[node_id]["tags"] = node_tags
def _create_node(self, node_config, tags, count):
    """Creates a number of nodes within the namespace.

    Loads ``azure-vm-template.json`` from the directory containing this
    module, fills in its parameters from ``node_config`` and ``tags``, and
    submits it as an incremental deployment to the configured resource
    group, blocking until the deployment operation completes.

    Args:
        node_config: Must contain ``azure_arm_parameters`` (a dict of
            template parameter values); may contain ``tags``.
        tags: Tags merged over ``node_config["tags"]``; the cluster-name
            tag is always overwritten with ``self.cluster_name``.
        count: Passed to the template as ``vmCount``.
    """
    resource_group = self.provider_config["resource_group"]

    # load the template file
    template_path = Path(__file__).parent.joinpath("azure-vm-template.json")
    with open(template_path, "r", encoding="utf-8") as template_fp:
        template = json.load(template_fp)

    # get the tags (copy first so node_config itself is never modified)
    config_tags = node_config.get("tags", {}).copy()
    config_tags.update(tags)
    config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name

    name_tag = config_tags.get(TAG_RAY_NODE_NAME, "node")
    unique_id = uuid4().hex[:VM_NAME_UUID_LEN]
    vm_name = "{name}-{id}".format(name=name_tag, id=unique_id)
    use_internal_ips = self.provider_config.get("use_internal_ips", False)

    template_params = node_config["azure_arm_parameters"].copy()
    template_params["vmName"] = vm_name
    template_params["provisionPublicIp"] = not use_internal_ips
    template_params["vmTags"] = config_tags
    template_params["vmCount"] = count

    parameters = {
        "properties": {
            "mode": DeploymentMode.incremental,
            "template": template,
            # The deployment API expects every value wrapped as
            # {"value": ...}.
            "parameters": {
                key: {"value": value}
                for key, value in template_params.items()
            },
        }
    }

    # TODO: we could get the private/public ips back directly
    create_or_update = get_azure_sdk_function(
        client=self.resource_client.deployments,
        function_name="create_or_update",
    )
    # Name the deployment after the unique VM name rather than the bare
    # name tag, so that launches sharing a name tag do not also share (and
    # contend for) a single deployment name in the resource group.
    create_or_update(
        resource_group_name=resource_group,
        deployment_name=vm_name,
        parameters=parameters,
    ).wait()
def terminate_node(self, node_id):
    """Tear down the VM for ``node_id`` and the resources attached to it.

    The VM is deleted first (blocking until that finishes), then its NIC,
    its public IP when the node metadata lists one, and finally its OS and
    data disks. Each deletion is attempted independently: a failure is
    logged as a warning and the remaining steps still run. If looking up
    the node's metadata raises ``KeyError``, nothing is done.
    """
    # NOTE(review): a second ``terminate_node`` definition appears later in
    # this file; if both live in the same scope the later one replaces this
    # one. Confirm which is intended.
    rg_name = self.provider_config["resource_group"]

    try:
        node_meta = self._get_node(node_id)
    except KeyError:
        # node no longer exists
        return

    # TODO: deallocate instead of delete to allow possible reuse

    # Collect the disk names up front so they can be deleted after the VM.
    vm_info = self.compute_client.virtual_machines.get(
        resource_group_name=rg_name, vm_name=node_id
    )
    storage = vm_info.storage_profile
    disk_names = {data_disk.name for data_disk in storage.data_disks}
    disk_names.add(storage.os_disk.name)

    try:
        remove_vm = get_azure_sdk_function(
            client=self.compute_client.virtual_machines,
            function_name="delete",
        )
        # The VM deletion must complete before the steps below run.
        remove_vm(resource_group_name=rg_name, vm_name=node_id).wait()
    except Exception as exc:
        logger.warning("Failed to delete VM: {}".format(exc))

    try:
        remove_nic = get_azure_sdk_function(
            client=self.network_client.network_interfaces,
            function_name="delete",
        )
        remove_nic(
            resource_group_name=rg_name,
            network_interface_name=node_meta["nic_name"],
        )
    except Exception as exc:
        logger.warning("Failed to delete nic: {}".format(exc))

    # The public IP is optional; only nodes whose metadata names one have it.
    if "public_ip_name" in node_meta:
        try:
            remove_ip = get_azure_sdk_function(
                client=self.network_client.public_ip_addresses,
                function_name="delete",
            )
            remove_ip(
                resource_group_name=rg_name,
                public_ip_address_name=node_meta["public_ip_name"],
            )
        except Exception as exc:
            logger.warning("Failed to delete public ip: {}".format(exc))

    for disk_name in disk_names:
        try:
            remove_disk = get_azure_sdk_function(
                client=self.compute_client.disks,
                function_name="delete",
            )
            remove_disk(resource_group_name=rg_name, disk_name=disk_name)
        except Exception as exc:
            logger.warning("Failed to delete disk: {}".format(exc))
def terminate_node(self, node_id):
    """Terminates the specified node.

    If looking up the node's metadata raises ``KeyError`` the node is
    treated as already gone and nothing is done.

    When ``self.cache_stopped_nodes`` is truthy the VM is deallocated and
    all of its resources are left in place. Otherwise the VM is deleted
    (blocking until that finishes), followed by its NIC, its public IP
    when the metadata lists one, and its OS and data disks.

    The stop and delete calls are best effort: a failure is logged as a
    warning and the remaining steps still run.
    """
    resource_group = self.provider_config["resource_group"]
    try:
        # get metadata for node
        metadata = self._get_node(node_id)
    except KeyError:
        # node no longer exists
        return

    if self.cache_stopped_nodes:
        try:
            # stop machine and leave all resources
            logger.info(
                f"Stopping instance {node_id} "
                "(to fully terminate instead, "
                "set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration)"
            )
            stop = get_azure_sdk_function(
                client=self.compute_client.virtual_machines,
                function_name="deallocate",
            )
            stop(resource_group_name=resource_group, vm_name=node_id)
        except Exception as e:
            logger.warning("Failed to stop VM: {}".format(e))
        return

    # gather disks to delete later
    vm = self.compute_client.virtual_machines.get(
        resource_group_name=resource_group, vm_name=node_id
    )
    disks = {d.name for d in vm.storage_profile.data_disks}
    disks.add(vm.storage_profile.os_disk.name)

    try:
        # delete machine, must wait for this to complete
        delete = get_azure_sdk_function(
            client=self.compute_client.virtual_machines,
            function_name="delete",
        )
        delete(resource_group_name=resource_group, vm_name=node_id).wait()
    except Exception as e:
        logger.warning("Failed to delete VM: {}".format(e))

    try:
        # delete nic
        delete = get_azure_sdk_function(
            client=self.network_client.network_interfaces,
            function_name="delete",
        )
        delete(
            resource_group_name=resource_group,
            network_interface_name=metadata["nic_name"],
        )
    except Exception as e:
        logger.warning("Failed to delete nic: {}".format(e))

    # delete ip address
    if "public_ip_name" in metadata:
        try:
            delete = get_azure_sdk_function(
                client=self.network_client.public_ip_addresses,
                function_name="delete",
            )
            delete(
                resource_group_name=resource_group,
                public_ip_address_name=metadata["public_ip_name"],
            )
        except Exception as e:
            logger.warning("Failed to delete public ip: {}".format(e))

    # delete disks
    for disk in disks:
        try:
            delete = get_azure_sdk_function(
                client=self.compute_client.disks,
                function_name="delete",
            )
            delete(resource_group_name=resource_group, disk_name=disk)
        except Exception as e:
            logger.warning("Failed to delete disk: {}".format(e))