def __init__(self, provider_config, cluster_name):
    """Set up EC2 clients, tag-batching state, and the node cache."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes",
                                                   True)
    region = provider_config["region"]
    credentials = provider_config.get("aws_credentials")
    # The primary client retries transient errors; the fail-fast client
    # is for calls where an immediate error beats waiting on retries.
    self.ec2 = make_ec2_client(
        region=region,
        max_retries=BOTO_MAX_RETRIES,
        aws_credentials=credentials)
    self.ec2_fail_fast = make_ec2_client(
        region=region, max_retries=0, aws_credentials=credentials)

    # Rotate through availability zones, starting at a random offset.
    self.subnet_idx = random.randint(0, 100)

    # tag_cache: tags we believe are already set on EC2.
    # tag_cache_pending: tags queued for the next batched upload.
    self.tag_cache = {}
    self.tag_cache_pending = defaultdict(dict)

    # State used to batch tag updates issued from multiple threads.
    self.batch_thread_count = 0
    self.batch_update_done = threading.Event()
    self.batch_update_done.set()
    self.ready_for_new_batch = threading.Event()
    self.ready_for_new_batch.set()
    self.tag_cache_lock = threading.Lock()
    self.count_lock = threading.Lock()

    # Node objects from the most recent nodes() call, kept to avoid
    # excessive DescribeInstances requests.
    self.cached_nodes = {}
def mock_init(self, provider_config, cluster_name):
    """Initialize the provider, plus a flag used by tests to detect
    whether the provider has created the head node."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    self._head_created = False
    self.cluster_name = cluster_name
    self.namespace = provider_config["namespace"]
def __init__(self, provider_config, cluster_name):
    """Create the EC2 clients and start the background tag-update thread."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes",
                                                   True)
    region = provider_config["region"]
    credentials = provider_config.get("aws_credentials")
    self.ec2 = make_ec2_client(
        region=region,
        max_retries=BOTO_MAX_RETRIES,
        aws_credentials=credentials)
    # Zero-retry client for calls where failing fast is preferable.
    self.ec2_fail_fast = make_ec2_client(
        region=region, max_retries=0, aws_credentials=credentials)

    # Round-robin over availability zones from a random starting point.
    self.subnet_idx = random.randint(0, 100)

    # tag_cache: tags we believe are already set on EC2.
    # tag_cache_pending: tags queued for the next upload.
    self.tag_cache = {}
    self.tag_cache_pending = {}
    self.tag_cache_lock = threading.Lock()
    self.tag_cache_update_event = threading.Event()
    self.tag_cache_kill_event = threading.Event()
    self.tag_update_thread = threading.Thread(
        target=self._node_tag_update_loop)
    self.tag_update_thread.start()

    # Node objects from the most recent nodes() call, kept to avoid
    # excessive DescribeInstances requests.
    self.cached_nodes = {}
def __init__(self, provider_config, cluster_name):
    """Build the Azure management clients, preferring the CLI profile
    and falling back to MSI when the CLI is not logged in."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    kwargs = {}
    if "subscription_id" in provider_config:
        kwargs["subscription_id"] = provider_config["subscription_id"]
    try:
        self.compute_client = get_client_from_cli_profile(
            client_class=ComputeManagementClient, **kwargs)
        self.network_client = get_client_from_cli_profile(
            client_class=NetworkManagementClient, **kwargs)
        self.resource_client = get_client_from_cli_profile(
            client_class=ResourceManagementClient, **kwargs)
    except CLIError as e:
        # Only the specific "not logged in" CLI error triggers the MSI
        # fallback; any other CLIError is a real failure and re-raises.
        if str(e) == "Please run 'az login' to setup account.":
            logger.info("CLI profile authentication failed. Trying MSI")
            credentials = MSIAuthentication()
            self.compute_client = ComputeManagementClient(
                credentials=credentials, **kwargs)
            self.network_client = NetworkManagementClient(
                credentials=credentials, **kwargs)
            self.resource_client = ResourceManagementClient(
                credentials=credentials, **kwargs)
        else:
            raise

    self.lock = RLock()
    # Cache of node objects to avoid repeated API lookups.
    self.cached_nodes = {}
def __init__(self, provider_config: dict, cluster_name: str):
    """Initialize the GCP provider and construct its resource clients."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    self.lock = RLock()
    self._construct_clients()
    # Node objects from the most recent nodes() call, kept to avoid
    # excessive DescribeInstances requests.
    self.cached_nodes: Dict[str, GCPNode] = {}
def __init__(self, provider_config, cluster_name):
    """Set up the GCE compute client and the node-object cache."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    self.lock = RLock()
    self.compute = discovery.build("compute", "v1")
    # Node objects from the most recent nodes() call, kept to avoid
    # excessive DescribeInstances requests.
    self.cached_nodes = {}
def __init__(self, provider_config, cluster_name):
    """Construct the compute client from the provider configuration."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    self.lock = RLock()
    # Only the compute client is needed here; the other two clients
    # returned by the helper are discarded.
    _, _, self.compute = construct_clients_from_provider_config(
        provider_config)
    # Node objects from the most recent nodes() call, kept to avoid
    # excessive DescribeInstances requests.
    self.cached_nodes = {}
def __init__(self, provider_config, cluster_name):
    """Create the EC2 resource handle and the node/IP caches."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    region = provider_config["region"]
    self.ec2 = boto3.resource("ec2", region_name=region)
    # Node objects from the most recent nodes() call, kept to avoid
    # excessive DescribeInstances requests.
    self.cached_nodes = {}
    # IPs are assumed never to change once assigned, so lookups are
    # memoized indefinitely.
    self.internal_ip_cache = {}
    self.external_ip_cache = {}
def __init__(self, provider_config, cluster_name):
    """Build the GCE compute client and the node/IP caches."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    self.compute = discovery.build("compute", "v1")
    # Node objects from the most recent nodes() call, kept to avoid
    # excessive instance-list requests.
    self.cached_nodes = {}
    # IPs are assumed never to change once assigned, so lookups are
    # memoized indefinitely.
    self.internal_ip_cache = {}
    self.external_ip_cache = {}
def __init__(self, provider_config, cluster_name):
    """Build the GCE compute client from explicit GCP credentials."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    self.lock = RLock()
    credentials = fetch_gcp_credentials_from_provider_config(
        provider_config)
    self.compute = _create_compute(credentials)
    # Node objects from the most recent nodes() call, kept to avoid
    # excessive DescribeInstances requests.
    self.cached_nodes = {}
def __init__(self, provider_config, cluster_name):
    """Initialize the Staroid client and resolve SKE settings.

    The SKE name and region come from the provider config or, failing
    that, from environment variables.
    """
    NodeProvider.__init__(self, provider_config, cluster_name)
    self.__cached = {}
    self.__star = Staroid(
        access_token=provider_config["access_token"],
        account=provider_config["account"])
    self.__ske = self._get_config_or_env(provider_config, "ske",
                                         "STAROID_SKE")
    self.__ske_region = self._get_config_or_env(provider_config,
                                                "ske_region",
                                                "STAROID_SKE_REGION")
def __init__(self, provider_config, cluster_name):
    """Create a retry-configured EC2 resource plus node and IP caches."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    retry_config = Config(retries=dict(max_attempts=BOTO_MAX_RETRIES))
    self.ec2 = boto3.resource(
        "ec2",
        region_name=provider_config["region"],
        config=retry_config)
    # Node objects from the most recent nodes() call, kept to avoid
    # excessive DescribeInstances requests.
    self.cached_nodes = {}
    # IPs are assumed never to change once assigned, so lookups are
    # memoized indefinitely.
    self.internal_ip_cache = {}
    self.external_ip_cache = {}
def __init__(self, provider_config, cluster_name):
    """Select the state backend: per-cluster files when a cluster name
    is given, otherwise coordinator-managed state."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    if not cluster_name:
        # No cluster name: this LocalNodeProvider is fronted by a
        # coordinator server.
        self.state = OnPremCoordinatorState(
            "/tmp/coordinator.lock", "/tmp/coordinator.state",
            provider_config["list_of_node_ips"])
        self.use_coordinator = True
    else:
        self.state = ClusterState(
            "/tmp/cluster-{}.lock".format(cluster_name),
            "/tmp/cluster-{}.state".format(cluster_name),
            provider_config,
        )
        self.use_coordinator = False
def __init__(self, provider_config, cluster_name):
    """Create a retry-configured EC2 resource plus node and IP caches."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    retry_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES})
    self.ec2 = boto3.resource(
        "ec2", region_name=provider_config["region"], config=retry_config)
    # Round-robin over availability zones from a random starting offset.
    self.subnet_idx = random.randint(0, 100)
    # Node objects from the most recent nodes() call, kept to avoid
    # excessive DescribeInstances requests.
    self.cached_nodes = {}
    # IPs are assumed never to change once assigned, so lookups are
    # memoized indefinitely.
    self.internal_ip_cache = {}
    self.external_ip_cache = {}
def __init__(self, provider_config, cluster_name):
    """Create the Azure management clients via DefaultAzureCredential."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    subscription_id = provider_config["subscription_id"]
    # NOTE(review): the shared token cache credential is excluded here —
    # presumably to avoid stale cached tokens; confirm before relying on it.
    credential = DefaultAzureCredential(
        exclude_shared_token_cache_credential=True)
    self.compute_client = ComputeManagementClient(credential,
                                                  subscription_id)
    self.network_client = NetworkManagementClient(credential,
                                                  subscription_id)
    self.resource_client = ResourceManagementClient(credential,
                                                    subscription_id)
    self.lock = RLock()
    # Cache of node objects to avoid repeated API lookups.
    self.cached_nodes = {}
def __init__(self, provider_config, cluster_name):
    """Create a retry-configured EC2 resource and initialize caches."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    retry_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES})
    self.ec2 = boto3.resource(
        "ec2", region_name=provider_config["region"], config=retry_config)
    # Round-robin over availability zones from a random starting offset.
    self.subnet_idx = random.randint(0, 100)
    # Node objects from the most recent nodes() call, kept to avoid
    # excessive DescribeInstances requests.
    self.cached_nodes = {}
    # IPs are assumed never to change once assigned, so lookups are
    # memoized indefinitely.
    self.internal_ip_cache = {}
    self.external_ip_cache = {}
def __init__(self, provider_config, cluster_name):
    """Set up the fake multi-node provider with a single head node.

    Raises:
        RuntimeError: if RAY_FAKE_CLUSTER is not set in the environment.
    """
    NodeProvider.__init__(self, provider_config, cluster_name)
    if "RAY_FAKE_CLUSTER" not in os.environ:
        raise RuntimeError(
            "FakeMultiNodeProvider requires ray to be started with "
            "RAY_FAKE_CLUSTER=1 ray start ...")
    head_tags = {
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
        TAG_RAY_USER_NODE_TYPE: FAKE_HEAD_NODE_TYPE,
        TAG_RAY_NODE_NAME: FAKE_HEAD_NODE_ID,
        TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
    }
    # The head node always exists; worker ids start from _next_node_id.
    self._nodes = {FAKE_HEAD_NODE_ID: {"tags": head_tags}}
    self._next_node_id = 0
def __init__(self, provider_config, cluster_name):
    """Select the state backend: per-cluster files when a cluster name
    is given, otherwise coordinator-managed state."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    if not cluster_name:
        # No cluster name: this LocalNodeProvider is fronted by a
        # coordinator server.
        self.state = OnPremCoordinatorState(
            "/tmp/coordinator.lock", "/tmp/coordinator.state",
            provider_config["list_of_node_ips"])
        self.use_coordinator = True
    else:
        self.state = ClusterState(
            get_lock_path(cluster_name),
            get_state_path(cluster_name),
            provider_config,
        )
        self.use_coordinator = False
def __init__(self, provider_config, cluster_name):
    """Create the EC2 resource and start the background tag-update thread."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    retry_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES})
    self.ec2 = boto3.resource(
        "ec2", region_name=provider_config["region"], config=retry_config)

    # Round-robin over availability zones from a random starting offset.
    self.subnet_idx = random.randint(0, 100)

    # tag_cache: tags we believe are already set on EC2.
    # tag_cache_pending: tags queued for the next upload.
    self.tag_cache = {}
    self.tag_cache_pending = {}
    self.tag_cache_lock = threading.Lock()
    self.tag_cache_update_event = threading.Event()
    self.tag_cache_kill_event = threading.Event()
    self.tag_update_thread = threading.Thread(
        target=self._node_tag_update_loop)
    self.tag_update_thread.start()

    # Node objects from the most recent nodes() call, kept to avoid
    # excessive DescribeInstances requests.
    self.cached_nodes = {}
def __init__(self, provider_config, cluster_name):
    """Create Azure compute/network clients, falling back to MSI when
    CLI-profile authentication fails."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    if "subscription_id" in provider_config:
        kwargs = {"subscription_id": provider_config["subscription_id"]}
    else:
        kwargs = {}
    try:
        self.compute_client = get_client_from_cli_profile(
            client_class=ComputeManagementClient, **kwargs)
        self.network_client = get_client_from_cli_profile(
            client_class=NetworkManagementClient, **kwargs)
    except Exception:
        # Any CLI-profile failure falls back to managed identity.
        logger.info("CLI profile authentication failed. Trying MSI",
                    exc_info=True)
        credentials = MSIAuthentication()
        self.compute_client = ComputeManagementClient(
            credentials=credentials, **kwargs)
        self.network_client = NetworkManagementClient(
            credentials=credentials, **kwargs)

    self.lock = RLock()
    # Cache of node objects to avoid repeated API lookups.
    self.cached_nodes = {}
def _set_up_config_for_head_node(config: Dict[str, Any], provider: NodeProvider, no_restart: bool) ->\ Tuple[Dict[str, Any], Any]: """Prepares autoscaling config and, if needed, ssh key, to be mounted onto the Ray head node for use by the autoscaler. Returns the modified config and the temporary config file that will be mounted onto the head node. """ # Rewrite the auth config so that the head # node can update the workers remote_config = copy.deepcopy(config) # drop proxy options if they exist, otherwise # head node won't be able to connect to workers remote_config["auth"].pop("ssh_proxy_command", None) if "ssh_private_key" in config["auth"]: remote_key_path = "~/ray_bootstrap_key.pem" remote_config["auth"]["ssh_private_key"] = remote_key_path # Adjust for new file locations new_mounts = {} for remote_path in config["file_mounts"]: new_mounts[remote_path] = remote_path remote_config["file_mounts"] = new_mounts remote_config["no_restart"] = no_restart remote_config = provider.prepare_for_head_node(remote_config) # Now inject the rewritten config and SSH key into the head node remote_config_file = tempfile.NamedTemporaryFile( "w", prefix="ray-bootstrap-") remote_config_file.write(json.dumps(remote_config)) remote_config_file.flush() config["file_mounts"].update({ "~/ray_bootstrap_config.yaml": remote_config_file.name }) if "ssh_private_key" in config["auth"]: config["file_mounts"].update({ remote_key_path: config["auth"]["ssh_private_key"], }) return config, remote_config_file
def __init__(self, provider_config, cluster_name):
    """Initialize cluster state backed by per-cluster files in /tmp."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    lock_path = "/tmp/cluster-{}.lock".format(cluster_name)
    state_path = "/tmp/cluster-{}.state".format(cluster_name)
    self.state = ClusterState(lock_path, state_path, provider_config)
def __init__(self, provider_config, cluster_name):
    """Create the boto3 EC2 resource for the configured region."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    region = provider_config["region"]
    self.ec2 = boto3.resource("ec2", region_name=region)
def __init__(self, provider_config, cluster_name):
    """Initialize with an empty in-memory node table."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    # In-memory node store; presumably keyed by node id — confirm
    # against the methods that populate it.
    self.nodes = {}
def __init__(self, provider_config, cluster_name):
    """Record the coordinator server address from the provider config."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    self.coordinator_address = provider_config["coordinator_address"]
def __init__(self, provider_config, cluster_name):
    """Store the cluster name and the namespace from the provider config."""
    NodeProvider.__init__(self, provider_config, cluster_name)
    self.namespace = provider_config["namespace"]
    self.cluster_name = cluster_name