Пример #1
0
    def __init__(self, provider_config, cluster_name):
        """Create the EC2 clients and the tag/node caching state.

        Args:
            provider_config: Provider configuration mapping. "region" is
                required; "cache_stopped_nodes" (default True) and
                "aws_credentials" are optional.
            cluster_name: Cluster name passed through to NodeProvider.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.cache_stopped_nodes = provider_config.get(
            "cache_stopped_nodes", True)
        credentials = provider_config.get("aws_credentials")

        # Two clients for the same region: one retrying up to
        # BOTO_MAX_RETRIES times, one that does not retry at all.
        region = provider_config["region"]
        self.ec2 = make_ec2_client(
            region=region,
            max_retries=BOTO_MAX_RETRIES,
            aws_credentials=credentials)
        self.ec2_fail_fast = make_ec2_client(
            region=region, max_retries=0, aws_credentials=credentials)

        # Availability zones are tried round-robin; begin at a random offset.
        self.subnet_idx = random.randint(0, 100)

        # Tags believed to be present on EC2 already.
        self.tag_cache = {}
        # Tags queued for upload.
        self.tag_cache_pending = defaultdict(dict)
        # How many threads are waiting on a batched tag update.
        self.batch_thread_count = 0

        # Both events start out in the set state.
        done_event = threading.Event()
        done_event.set()
        self.batch_update_done = done_event
        new_batch_event = threading.Event()
        new_batch_event.set()
        self.ready_for_new_batch = new_batch_event

        self.tag_cache_lock = threading.Lock()
        self.count_lock = threading.Lock()

        # Node objects from the latest nodes() call, cached to avoid
        # excessive DescribeInstances requests.
        self.cached_nodes = {}
Пример #2
0
def mock_init(self, provider_config, cluster_name):
    """Initialize a provider and add a flag recording head creation.

    Args:
        provider_config: Provider configuration mapping; must contain
            "namespace".
        cluster_name: Cluster name, stored on the instance and forwarded
            to NodeProvider.
    """
    NodeProvider.__init__(self, provider_config, cluster_name)
    self.cluster_name = cluster_name
    namespace = provider_config["namespace"]
    self.namespace = namespace

    # Extra attribute used to detect whether the provider created the head.
    self._head_created = False
Пример #3
0
    def __init__(self, provider_config, cluster_name):
        """Build the Azure management clients, trying CLI auth then MSI.

        Args:
            provider_config: Provider configuration mapping. An optional
                "subscription_id" entry is forwarded to every client.
            cluster_name: Cluster name passed through to NodeProvider.

        Raises:
            CLIError: Re-raised when CLI profile authentication fails with
                any message other than the "az login" prompt.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        client_kwargs = (
            {"subscription_id": provider_config["subscription_id"]}
            if "subscription_id" in provider_config else {})
        try:
            self.compute_client = get_client_from_cli_profile(
                client_class=ComputeManagementClient, **client_kwargs)
            self.network_client = get_client_from_cli_profile(
                client_class=NetworkManagementClient, **client_kwargs)
            self.resource_client = get_client_from_cli_profile(
                client_class=ResourceManagementClient, **client_kwargs)
        except CLIError as e:
            # Only the "not logged in" error triggers the MSI fallback;
            # every other CLI error propagates unchanged.
            if str(e) != "Please run 'az login' to setup account.":
                raise
            logger.info("CLI profile authentication failed. Trying MSI")

            msi_credentials = MSIAuthentication()
            self.compute_client = ComputeManagementClient(
                credentials=msi_credentials, **client_kwargs)
            self.network_client = NetworkManagementClient(
                credentials=msi_credentials, **client_kwargs)
            self.resource_client = ResourceManagementClient(
                credentials=msi_credentials, **client_kwargs)

        self.lock = RLock()

        # Cached node objects.
        self.cached_nodes = {}
Пример #4
0
    def __init__(self, provider_config, cluster_name):
        """Set up EC2 clients and start the tag update thread.

        Args:
            provider_config: Provider configuration mapping. "region" is
                required; "cache_stopped_nodes" (default True) and
                "aws_credentials" are optional.
            cluster_name: Cluster name passed through to NodeProvider.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.cache_stopped_nodes = provider_config.get(
            "cache_stopped_nodes", True)
        credentials = provider_config.get("aws_credentials")

        # Same region for both clients; only the retry budget differs.
        region = provider_config["region"]
        self.ec2 = make_ec2_client(
            region=region,
            max_retries=BOTO_MAX_RETRIES,
            aws_credentials=credentials)
        self.ec2_fail_fast = make_ec2_client(
            region=region, max_retries=0, aws_credentials=credentials)

        # Availability zones are tried round-robin; begin at a random offset.
        self.subnet_idx = random.randint(0, 100)

        # Tags believed to be present on EC2 already.
        self.tag_cache = {}
        # Tags queued for upload.
        self.tag_cache_pending = {}
        self.tag_cache_lock = threading.Lock()
        self.tag_cache_update_event = threading.Event()
        self.tag_cache_kill_event = threading.Event()
        # The thread runs self._node_tag_update_loop and is started only
        # after the tag-cache attributes above have been assigned.
        updater = threading.Thread(target=self._node_tag_update_loop)
        self.tag_update_thread = updater
        updater.start()

        # Node objects from the latest nodes() call, cached to avoid
        # excessive DescribeInstances requests.
        self.cached_nodes = {}
Пример #5
0
    def __init__(self, provider_config: dict, cluster_name: str):
        """Create the lock, construct the clients and the node cache.

        Args:
            provider_config: Provider configuration, forwarded to
                NodeProvider.
            cluster_name: Cluster name, forwarded to NodeProvider.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        # The lock is assigned before self._construct_clients() is invoked.
        self.lock = RLock()
        self._construct_clients()

        # Node objects from the latest nodes() call, kept to avoid
        # excessive DescribeInstances requests.
        node_cache: Dict[str, GCPNode] = {}
        self.cached_nodes = node_cache
Пример #6
0
    def __init__(self, provider_config, cluster_name):
        """Create the lock, the "compute" v1 client and the node cache.

        Args:
            provider_config: Provider configuration, forwarded to
                NodeProvider.
            cluster_name: Cluster name, forwarded to NodeProvider.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)

        self.lock = RLock()
        compute_service = discovery.build("compute", "v1")
        self.compute = compute_service

        # Node objects from the latest nodes() call, kept to avoid
        # excessive DescribeInstances requests.
        self.cached_nodes = {}
Пример #7
0
    def __init__(self, provider_config, cluster_name):
        """Initialize the provider with a lock, compute client and cache.

        Args:
            provider_config: Provider configuration, forwarded to
                NodeProvider.
            cluster_name: Cluster name, forwarded to NodeProvider.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)

        self.lock = RLock()
        # Client built through discovery for the "compute" API, version "v1".
        api_name, api_version = "compute", "v1"
        self.compute = discovery.build(api_name, api_version)

        # Result of the most recent nodes() call; caching it avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}
Пример #8
0
    def __init__(self, provider_config, cluster_name):
        """Create the lock, the compute client and the node cache.

        Args:
            provider_config: Provider configuration; forwarded to
                NodeProvider and to construct_clients_from_provider_config.
            cluster_name: Cluster name, forwarded to NodeProvider.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)

        self.lock = RLock()
        # The helper yields three values; only the third one is kept.
        _first, _second, compute = construct_clients_from_provider_config(
            provider_config)
        self.compute = compute

        # Node objects from the latest nodes() call, kept to avoid
        # excessive DescribeInstances requests.
        self.cached_nodes = {}
Пример #9
0
    def __init__(self, provider_config, cluster_name):
        """Create the EC2 resource and the node/IP caches.

        Args:
            provider_config: Provider configuration mapping; must contain
                "region".
            cluster_name: Cluster name, forwarded to NodeProvider.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        region = provider_config["region"]
        self.ec2 = boto3.resource("ec2", region_name=region)

        # Node objects from the latest nodes() call, kept to avoid
        # excessive DescribeInstances requests.
        self.cached_nodes = {}

        # IP lookup caches; IPs are assumed never to change once assigned.
        self.internal_ip_cache = {}
        self.external_ip_cache = {}
Пример #10
0
    def __init__(self, provider_config, cluster_name):
        """Create the "compute" v1 client and the node/IP caches.

        Args:
            provider_config: Provider configuration, forwarded to
                NodeProvider.
            cluster_name: Cluster name, forwarded to NodeProvider.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)

        compute_service = discovery.build("compute", "v1")
        self.compute = compute_service

        # Node objects from the latest nodes() call, kept to avoid
        # excessive DescribeInstances requests.
        self.cached_nodes = {}

        # IP lookup caches; IPs are assumed never to change once assigned.
        self.internal_ip_cache = {}
        self.external_ip_cache = {}
Пример #11
0
    def __init__(self, provider_config, cluster_name):
        """Create the Staroid client and read the SKE settings.

        Args:
            provider_config: Provider configuration mapping; must contain
                "access_token" and "account". It is also passed to
                self._get_config_or_env together with the "ske" /
                "STAROID_SKE" and "ske_region" / "STAROID_SKE_REGION"
                name pairs.
            cluster_name: Cluster name, forwarded to NodeProvider.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.__cached = {}

        token = provider_config["access_token"]
        account = provider_config["account"]
        self.__star = Staroid(access_token=token, account=account)

        self.__ske = self._get_config_or_env(
            provider_config, "ske", "STAROID_SKE")
        self.__ske_region = self._get_config_or_env(
            provider_config, "ske_region", "STAROID_SKE_REGION")
Пример #12
0
    def __init__(self, provider_config, cluster_name):
        """Create the lock, the compute client and the node cache.

        Args:
            provider_config: Provider configuration; forwarded to
                NodeProvider and used to fetch the GCP credentials.
            cluster_name: Cluster name, forwarded to NodeProvider.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)

        self.lock = RLock()
        # Credentials fetched from the config are handed to _create_compute.
        credentials = fetch_gcp_credentials_from_provider_config(
            provider_config)
        self.compute = _create_compute(credentials)

        # Node objects from the latest nodes() call, kept to avoid
        # excessive DescribeInstances requests.
        self.cached_nodes = {}
Пример #13
0
    def __init__(self, provider_config, cluster_name):
        """Initialize the provider with a compute client and empty caches.

        Args:
            provider_config: Provider configuration, forwarded to
                NodeProvider.
            cluster_name: Cluster name, forwarded to NodeProvider.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)

        # Client built through discovery for the "compute" API, version "v1".
        api_name, api_version = "compute", "v1"
        self.compute = discovery.build(api_name, api_version)

        # Result of the most recent nodes() call; caching it avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}

        # Caches for IP lookups, assuming an assigned IP never changes.
        self.internal_ip_cache = {}
        self.external_ip_cache = {}
Пример #14
0
    def __init__(self, provider_config, cluster_name):
        """Create the EC2 resource with a retry limit and the caches.

        Args:
            provider_config: Provider configuration mapping; must contain
                "region".
            cluster_name: Cluster name, forwarded to NodeProvider.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        # The retry budget for the resource comes from BOTO_MAX_RETRIES.
        retry_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES})
        region = provider_config["region"]
        self.ec2 = boto3.resource(
            "ec2", region_name=region, config=retry_config)

        # Node objects from the latest nodes() call, kept to avoid
        # excessive DescribeInstances requests.
        self.cached_nodes = {}

        # IP lookup caches; IPs are assumed never to change once assigned.
        self.internal_ip_cache = {}
        self.external_ip_cache = {}
Пример #15
0
    def __init__(self, provider_config, cluster_name):
        """Create the EC2 resource, the subnet offset and the caches.

        Args:
            provider_config: Provider configuration mapping; must contain
                "region".
            cluster_name: Cluster name, forwarded to NodeProvider.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        # The retry budget for the resource comes from BOTO_MAX_RETRIES.
        retry_config = Config(retries=dict(max_attempts=BOTO_MAX_RETRIES))
        region = provider_config["region"]
        self.ec2 = boto3.resource(
            "ec2", region_name=region, config=retry_config)

        # Availability zones are tried round-robin; begin at a random offset.
        self.subnet_idx = random.randint(0, 100)

        # Node objects from the latest nodes() call, kept to avoid
        # excessive DescribeInstances requests.
        self.cached_nodes = {}

        # IP lookup caches; IPs are assumed never to change once assigned.
        self.internal_ip_cache = {}
        self.external_ip_cache = {}
Пример #16
0
    def __init__(self, provider_config, cluster_name):
        """Build the Azure management clients, the lock and the node cache.

        Args:
            provider_config: Provider configuration mapping; must contain
                "subscription_id".
            cluster_name: Cluster name, forwarded to NodeProvider.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        subscription = provider_config["subscription_id"]
        # One credential object, created with the shared token cache
        # credential excluded, is shared by all three clients.
        azure_credential = DefaultAzureCredential(
            exclude_shared_token_cache_credential=True)
        self.compute_client = ComputeManagementClient(
            azure_credential, subscription)
        self.network_client = NetworkManagementClient(
            azure_credential, subscription)
        self.resource_client = ResourceManagementClient(
            azure_credential, subscription)

        self.lock = RLock()

        # Cached node objects.
        self.cached_nodes = {}
Пример #17
0
    def __init__(self, provider_config, cluster_name):
        """Pick the state backend depending on whether a name is given.

        With a truthy cluster_name a ClusterState keyed by that name under
        /tmp is used; otherwise an OnPremCoordinatorState is created.

        Args:
            provider_config: Provider configuration mapping. In the
                coordinator case it must contain "list_of_node_ips".
            cluster_name: Cluster name; a falsy value selects the
                coordinator mode.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)

        if not cluster_name:
            # LocalNodeProvider backed by a coordinator server.
            node_ips = provider_config["list_of_node_ips"]
            self.state = OnPremCoordinatorState(
                "/tmp/coordinator.lock", "/tmp/coordinator.state", node_ips)
            self.use_coordinator = True
        else:
            lock_file = "/tmp/cluster-{}.lock".format(cluster_name)
            state_file = "/tmp/cluster-{}.state".format(cluster_name)
            self.state = ClusterState(lock_file, state_file, provider_config)
            self.use_coordinator = False
Пример #18
0
    def __init__(self, provider_config, cluster_name):
        """Initialize the EC2 resource, subnet offset and lookup caches.

        Args:
            provider_config: Provider configuration mapping; must contain
                "region".
            cluster_name: Cluster name, forwarded to NodeProvider.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        # BOTO_MAX_RETRIES caps the number of attempts for the resource.
        boto_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES})
        self.ec2 = boto3.resource(
            "ec2", region_name=provider_config["region"], config=boto_config)

        # Start the round-robin over availability zones at a random offset.
        self.subnet_idx = random.randint(0, 100)

        # Result of the most recent nodes() call; caching it avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}

        # Caches for IP lookups, assuming an assigned IP never changes.
        self.internal_ip_cache = {}
        self.external_ip_cache = {}
Пример #19
0
 def __init__(self, provider_config, cluster_name):
     """Create the fake provider with a single pre-registered head node.

     Args:
         provider_config: Provider configuration, forwarded to
             NodeProvider.
         cluster_name: Cluster name, forwarded to NodeProvider.

     Raises:
         RuntimeError: If RAY_FAKE_CLUSTER is not set in the environment.
     """
     NodeProvider.__init__(self, provider_config, cluster_name)
     fake_cluster_enabled = "RAY_FAKE_CLUSTER" in os.environ
     if not fake_cluster_enabled:
         raise RuntimeError(
             "FakeMultiNodeProvider requires ray to be started with "
             "RAY_FAKE_CLUSTER=1 ray start ...")

     # Tags describing the head node entry the provider starts with.
     head_tags = {
         TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
         TAG_RAY_USER_NODE_TYPE: FAKE_HEAD_NODE_TYPE,
         TAG_RAY_NODE_NAME: FAKE_HEAD_NODE_ID,
         TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
     }
     self._nodes = {FAKE_HEAD_NODE_ID: {"tags": head_tags}}
     self._next_node_id = 0
Пример #20
0
    def __init__(self, provider_config, cluster_name):
        """Pick the state backend depending on whether a name is given.

        With a truthy cluster_name a ClusterState is created from the
        paths returned by get_lock_path / get_state_path; otherwise an
        OnPremCoordinatorState is created.

        Args:
            provider_config: Provider configuration mapping. In the
                coordinator case it must contain "list_of_node_ips".
            cluster_name: Cluster name; a falsy value selects the
                coordinator mode.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)

        if not cluster_name:
            # LocalNodeProvider backed by a coordinator server.
            node_ips = provider_config["list_of_node_ips"]
            self.state = OnPremCoordinatorState(
                "/tmp/coordinator.lock", "/tmp/coordinator.state", node_ips)
            self.use_coordinator = True
        else:
            lock_file = get_lock_path(cluster_name)
            state_file = get_state_path(cluster_name)
            self.state = ClusterState(lock_file, state_file, provider_config)
            self.use_coordinator = False
Пример #21
0
    def __init__(self, provider_config, cluster_name):
        """Set up the EC2 resource and start the tag update thread.

        Args:
            provider_config: Provider configuration mapping; must contain
                "region".
            cluster_name: Cluster name, forwarded to NodeProvider.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        # The retry budget for the resource comes from BOTO_MAX_RETRIES.
        retry_config = Config(retries=dict(max_attempts=BOTO_MAX_RETRIES))
        region = provider_config["region"]
        self.ec2 = boto3.resource(
            "ec2", region_name=region, config=retry_config)

        # Availability zones are tried round-robin; begin at a random offset.
        self.subnet_idx = random.randint(0, 100)

        # Tags believed to be present on EC2 already.
        self.tag_cache = {}
        # Tags queued for upload.
        self.tag_cache_pending = {}
        self.tag_cache_lock = threading.Lock()
        self.tag_cache_update_event = threading.Event()
        self.tag_cache_kill_event = threading.Event()
        # The thread runs self._node_tag_update_loop and is started only
        # after the tag-cache attributes above have been assigned.
        updater = threading.Thread(target=self._node_tag_update_loop)
        self.tag_update_thread = updater
        updater.start()

        # Node objects from the latest nodes() call, cached to avoid
        # excessive DescribeInstances requests.
        self.cached_nodes = {}
Пример #22
0
    def __init__(self, provider_config, cluster_name):
        """Build the Azure compute/network clients, trying CLI auth then MSI.

        Any exception raised while creating the clients from the CLI
        profile is logged (with traceback) and MSI authentication is used
        instead.

        Args:
            provider_config: Provider configuration mapping. An optional
                "subscription_id" entry is forwarded to every client.
            cluster_name: Cluster name, forwarded to NodeProvider.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        client_kwargs = (
            {"subscription_id": provider_config["subscription_id"]}
            if "subscription_id" in provider_config else {})
        try:
            self.compute_client = get_client_from_cli_profile(
                client_class=ComputeManagementClient, **client_kwargs)
            self.network_client = get_client_from_cli_profile(
                client_class=NetworkManagementClient, **client_kwargs)
        except Exception:
            logger.info(
                "CLI profile authentication failed. Trying MSI",
                exc_info=True)

            msi_credentials = MSIAuthentication()
            self.compute_client = ComputeManagementClient(
                credentials=msi_credentials, **client_kwargs)
            self.network_client = NetworkManagementClient(
                credentials=msi_credentials, **client_kwargs)

        self.lock = RLock()

        # Cached node objects.
        self.cached_nodes = {}
Пример #23
0
 def __init__(self, provider_config, cluster_name):
     """Create the ClusterState using lock/state files under /tmp.

     Args:
         provider_config: Provider configuration; forwarded to
             NodeProvider and to ClusterState.
         cluster_name: Cluster name, interpolated into both file paths.
     """
     NodeProvider.__init__(self, provider_config, cluster_name)
     lock_file = "/tmp/cluster-{}.lock".format(cluster_name)
     state_file = "/tmp/cluster-{}.state".format(cluster_name)
     self.state = ClusterState(lock_file, state_file, provider_config)
Пример #24
0
 def __init__(self, provider_config, cluster_name):
     """Create the EC2 resource for the configured region.

     Args:
         provider_config: Provider configuration mapping; must contain
             "region".
         cluster_name: Cluster name, forwarded to NodeProvider.
     """
     NodeProvider.__init__(self, provider_config, cluster_name)
     region = provider_config["region"]
     self.ec2 = boto3.resource("ec2", region_name=region)
Пример #25
0
 def __init__(self, provider_config, cluster_name):
     """Store the cluster name and the configured namespace.

     Args:
         provider_config: Provider configuration mapping; must contain
             "namespace".
         cluster_name: Cluster name, stored on the instance and forwarded
             to NodeProvider.
     """
     NodeProvider.__init__(self, provider_config, cluster_name)
     self.cluster_name = cluster_name
     namespace = provider_config["namespace"]
     self.namespace = namespace
Пример #26
0
 def __init__(self, provider_config, cluster_name):
     """Store the coordinator address taken from the provider config.

     Args:
         provider_config: Provider configuration mapping; must contain
             "coordinator_address".
         cluster_name: Cluster name, forwarded to NodeProvider.
     """
     NodeProvider.__init__(self, provider_config, cluster_name)
     address = provider_config["coordinator_address"]
     self.coordinator_address = address
Пример #27
0
 def __init__(self, provider_config, cluster_name):
     """Initialize the provider state from per-cluster files in /tmp.

     Args:
         provider_config: Provider configuration; forwarded to
             NodeProvider and to ClusterState.
         cluster_name: Cluster name, interpolated into the lock and state
             file paths.
     """
     NodeProvider.__init__(self, provider_config, cluster_name)
     lock_path, state_path = (
         "/tmp/cluster-{}.lock".format(cluster_name),
         "/tmp/cluster-{}.state".format(cluster_name),
     )
     self.state = ClusterState(lock_path, state_path, provider_config)
Пример #28
0
 def __init__(self, provider_config, cluster_name):
     """Initialize the provider with an empty nodes mapping.

     Args:
         provider_config: Provider configuration, forwarded to
             NodeProvider.
         cluster_name: Cluster name, forwarded to NodeProvider.
     """
     NodeProvider.__init__(self, provider_config, cluster_name)
     # Starts out empty.
     self.nodes = {}