Example #1
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes",
                                                       True)
        aws_credentials = provider_config.get("aws_credentials")

        self.ec2 = make_ec2_client(
            region=provider_config["region"],
            max_retries=BOTO_MAX_RETRIES,
            aws_credentials=aws_credentials)
        self.ec2_fail_fast = make_ec2_client(
            region=provider_config["region"],
            max_retries=0,
            aws_credentials=aws_credentials)

        # Try availability zones round-robin, starting from random offset
        self.subnet_idx = random.randint(0, 100)

        # Tags that we believe to actually be on EC2.
        self.tag_cache = {}
        # Tags that we will soon upload.
        self.tag_cache_pending = defaultdict(dict)
        # Number of threads waiting for a batched tag update.
        self.batch_thread_count = 0
        self.batch_update_done = threading.Event()
        self.batch_update_done.set()
        self.ready_for_new_batch = threading.Event()
        self.ready_for_new_batch.set()
        self.tag_cache_lock = threading.Lock()
        self.count_lock = threading.Lock()

        # Cache of node objects from the last nodes() call. This avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}
Example #2
def mock_init(self, provider_config, cluster_name):
    # Adds an attribute to detect if the provider has created the head.
    NodeProvider.__init__(self, provider_config, cluster_name)
    self.cluster_name = cluster_name
    self.namespace = provider_config["namespace"]

    self._head_created = False
Example #3
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes",
                                                       True)
        aws_credentials = provider_config.get("aws_credentials")

        self.ec2 = make_ec2_client(region=provider_config["region"],
                                   max_retries=BOTO_MAX_RETRIES,
                                   aws_credentials=aws_credentials)
        self.ec2_fail_fast = make_ec2_client(region=provider_config["region"],
                                             max_retries=0,
                                             aws_credentials=aws_credentials)

        # Try availability zones round-robin, starting from random offset
        self.subnet_idx = random.randint(0, 100)

        self.tag_cache = {}  # Tags that we believe to actually be on EC2.
        self.tag_cache_pending = {}  # Tags that we will soon upload.
        self.tag_cache_lock = threading.Lock()
        self.tag_cache_update_event = threading.Event()
        self.tag_cache_kill_event = threading.Event()
        self.tag_update_thread = threading.Thread(
            target=self._node_tag_update_loop)
        self.tag_update_thread.start()

        # Cache of node objects from the last nodes() call. This avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}
Example #4
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        kwargs = {}
        if "subscription_id" in provider_config:
            kwargs["subscription_id"] = provider_config["subscription_id"]
        try:
            self.compute_client = get_client_from_cli_profile(
                client_class=ComputeManagementClient, **kwargs)
            self.network_client = get_client_from_cli_profile(
                client_class=NetworkManagementClient, **kwargs)
            self.resource_client = get_client_from_cli_profile(
                client_class=ResourceManagementClient, **kwargs)
        except CLIError as e:
            if str(e) != "Please run 'az login' to setup account.":
                raise
            else:
                logger.info("CLI profile authentication failed. Trying MSI")

                credentials = MSIAuthentication()
                self.compute_client = ComputeManagementClient(
                    credentials=credentials, **kwargs)
                self.network_client = NetworkManagementClient(
                    credentials=credentials, **kwargs)
                self.resource_client = ResourceManagementClient(
                    credentials=credentials, **kwargs)

        self.lock = RLock()

        # cache node objects
        self.cached_nodes = {}
Example #5
    def __init__(self, provider_config: dict, cluster_name: str):
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.lock = RLock()
        self._construct_clients()

        # Cache of node objects from the last nodes() call. This avoids
        # excessive DescribeInstances requests.
        self.cached_nodes: Dict[str, GCPNode] = {}
Example #6
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)

        self.lock = RLock()
        self.compute = discovery.build("compute", "v1")

        # Cache of node objects from the last nodes() call. This avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}
Example #7
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)

        self.lock = RLock()
        self.compute = discovery.build("compute", "v1")

        # Cache of node objects from the last nodes() call. This avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}
Example #8
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)

        self.lock = RLock()
        _, _, self.compute = construct_clients_from_provider_config(
            provider_config)

        # Cache of node objects from the last nodes() call. This avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}
Example #9
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.ec2 = boto3.resource("ec2", region_name=provider_config["region"])

        # Cache of node objects from the last nodes() call. This avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}

        # Cache of ip lookups. We assume IPs never change once assigned.
        self.internal_ip_cache = {}
        self.external_ip_cache = {}
Example #10
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)

        self.compute = discovery.build("compute", "v1")

        # Cache of node objects from the last nodes() call. This avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}

        # Cache of ip lookups. We assume IPs never change once assigned.
        self.internal_ip_cache = {}
        self.external_ip_cache = {}
Example #11
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)

        self.lock = RLock()
        gcp_credentials = fetch_gcp_credentials_from_provider_config(
            provider_config)

        self.compute = _create_compute(gcp_credentials)

        # Cache of node objects from the last nodes() call. This avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}
Example #12
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)

        self.compute = discovery.build("compute", "v1")

        # Cache of node objects from the last nodes() call. This avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}

        # Cache of ip lookups. We assume IPs never change once assigned.
        self.internal_ip_cache = {}
        self.external_ip_cache = {}
Example #13
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.__cached = {}

        self.__star = Staroid(access_token=provider_config["access_token"],
                              account=provider_config["account"])

        self.__ske = self._get_config_or_env(provider_config, "ske",
                                             "STAROID_SKE")
        self.__ske_region = self._get_config_or_env(provider_config,
                                                    "ske_region",
                                                    "STAROID_SKE_REGION")
Example #14
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        config = Config(retries=dict(max_attempts=BOTO_MAX_RETRIES))
        self.ec2 = boto3.resource(
            "ec2", region_name=provider_config["region"], config=config)

        # Cache of node objects from the last nodes() call. This avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}

        # Cache of ip lookups. We assume IPs never change once assigned.
        self.internal_ip_cache = {}
        self.external_ip_cache = {}
Example #15
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)

        if cluster_name:
            self.state = ClusterState(
                "/tmp/cluster-{}.lock".format(cluster_name),
                "/tmp/cluster-{}.state".format(cluster_name),
                provider_config,
            )
            self.use_coordinator = False
        else:
            # LocalNodeProvider with a coordinator server.
            self.state = OnPremCoordinatorState(
                "/tmp/coordinator.lock", "/tmp/coordinator.state",
                provider_config["list_of_node_ips"])
            self.use_coordinator = True
Example #16
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        config = Config(retries={'max_attempts': BOTO_MAX_RETRIES})
        self.ec2 = boto3.resource(
            "ec2", region_name=provider_config["region"], config=config)

        # Try availability zones round-robin, starting from random offset
        self.subnet_idx = random.randint(0, 100)

        # Cache of node objects from the last nodes() call. This avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}

        # Cache of ip lookups. We assume IPs never change once assigned.
        self.internal_ip_cache = {}
        self.external_ip_cache = {}
Example #17
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        subscription_id = provider_config["subscription_id"]
        credential = DefaultAzureCredential(
            exclude_shared_token_cache_credential=True)
        self.compute_client = ComputeManagementClient(credential,
                                                      subscription_id)
        self.network_client = NetworkManagementClient(credential,
                                                      subscription_id)
        self.resource_client = ResourceManagementClient(
            credential, subscription_id)

        self.lock = RLock()

        # cache node objects
        self.cached_nodes = {}
Example #18
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        config = Config(retries={'max_attempts': BOTO_MAX_RETRIES})
        self.ec2 = boto3.resource("ec2",
                                  region_name=provider_config["region"],
                                  config=config)

        # Try availability zones round-robin, starting from random offset
        self.subnet_idx = random.randint(0, 100)

        # Cache of node objects from the last nodes() call. This avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}

        # Cache of ip lookups. We assume IPs never change once assigned.
        self.internal_ip_cache = {}
        self.external_ip_cache = {}
Example #19
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        if "RAY_FAKE_CLUSTER" not in os.environ:
            raise RuntimeError(
                "FakeMultiNodeProvider requires ray to be started with "
                "RAY_FAKE_CLUSTER=1 ray start ...")
        self._nodes = {
            FAKE_HEAD_NODE_ID: {
                "tags": {
                    TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
                    TAG_RAY_USER_NODE_TYPE: FAKE_HEAD_NODE_TYPE,
                    TAG_RAY_NODE_NAME: FAKE_HEAD_NODE_ID,
                    TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
                }
            },
        }
        self._next_node_id = 0
Example #20
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)

        if cluster_name:
            lock_path = get_lock_path(cluster_name)
            state_path = get_state_path(cluster_name)
            self.state = ClusterState(
                lock_path,
                state_path,
                provider_config,
            )
            self.use_coordinator = False
        else:
            # LocalNodeProvider with a coordinator server.
            self.state = OnPremCoordinatorState(
                "/tmp/coordinator.lock", "/tmp/coordinator.state",
                provider_config["list_of_node_ips"])
            self.use_coordinator = True
Example #21
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        config = Config(retries={"max_attempts": BOTO_MAX_RETRIES})
        self.ec2 = boto3.resource(
            "ec2", region_name=provider_config["region"], config=config)

        # Try availability zones round-robin, starting from random offset
        self.subnet_idx = random.randint(0, 100)

        self.tag_cache = {}  # Tags that we believe to actually be on EC2.
        self.tag_cache_pending = {}  # Tags that we will soon upload.
        self.tag_cache_lock = threading.Lock()
        self.tag_cache_update_event = threading.Event()
        self.tag_cache_kill_event = threading.Event()
        self.tag_update_thread = threading.Thread(
            target=self._node_tag_update_loop)
        self.tag_update_thread.start()

        # Cache of node objects from the last nodes() call. This avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}
Example #22
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        kwargs = {}
        if "subscription_id" in provider_config:
            kwargs["subscription_id"] = provider_config["subscription_id"]
        try:
            self.compute_client = get_client_from_cli_profile(
                client_class=ComputeManagementClient, **kwargs)
            self.network_client = get_client_from_cli_profile(
                client_class=NetworkManagementClient, **kwargs)
        except Exception:
            logger.info("CLI profile authentication failed. Trying MSI",
                        exc_info=True)

            credentials = MSIAuthentication()
            self.compute_client = ComputeManagementClient(
                credentials=credentials, **kwargs)
            self.network_client = NetworkManagementClient(
                credentials=credentials, **kwargs)

        self.lock = RLock()

        # cache node objects
        self.cached_nodes = {}
Example #23
def _set_up_config_for_head_node(config: Dict[str, Any],
                                 provider: NodeProvider,
                                 no_restart: bool) ->\
        Tuple[Dict[str, Any], Any]:
    """Prepares autoscaling config and, if needed, ssh key, to be mounted onto
    the Ray head node for use by the autoscaler.

    Returns the modified config and the temporary config file that will be
    mounted onto the head node.
    """
    # Rewrite the auth config so that the head
    # node can update the workers
    remote_config = copy.deepcopy(config)

    # drop proxy options if they exist, otherwise
    # head node won't be able to connect to workers
    remote_config["auth"].pop("ssh_proxy_command", None)

    if "ssh_private_key" in config["auth"]:
        remote_key_path = "~/ray_bootstrap_key.pem"
        remote_config["auth"]["ssh_private_key"] = remote_key_path

    # Adjust for new file locations
    new_mounts = {}
    for remote_path in config["file_mounts"]:
        new_mounts[remote_path] = remote_path
    remote_config["file_mounts"] = new_mounts
    remote_config["no_restart"] = no_restart

    remote_config = provider.prepare_for_head_node(remote_config)

    # Now inject the rewritten config and SSH key into the head node
    remote_config_file = tempfile.NamedTemporaryFile(
        "w", prefix="ray-bootstrap-")
    remote_config_file.write(json.dumps(remote_config))
    remote_config_file.flush()
    config["file_mounts"].update({
        "~/ray_bootstrap_config.yaml": remote_config_file.name
    })

    if "ssh_private_key" in config["auth"]:
        config["file_mounts"].update({
            remote_key_path: config["auth"]["ssh_private_key"],
        })

    return config, remote_config_file
Example #24
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.state = ClusterState("/tmp/cluster-{}.lock".format(cluster_name),
                                  "/tmp/cluster-{}.state".format(cluster_name),
                                  provider_config)
Example #25
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.ec2 = boto3.resource("ec2", region_name=provider_config["region"])
Example #26
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.nodes = {}
Example #27
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.state = ClusterState("/tmp/cluster-{}.lock".format(cluster_name),
                                  "/tmp/cluster-{}.state".format(cluster_name),
                                  provider_config)
Example #28
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.coordinator_address = provider_config["coordinator_address"]
Example #29
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.cluster_name = cluster_name
        self.namespace = provider_config["namespace"]