def __init__(self,
             redis_address,
             autoscaling_config,
             redis_password=None,
             prefix_cluster_info=False,
             monitor_ip=None,
             stop_event: Optional[Event] = None):
    """Wire the monitor up to Redis and the GCS and create its bookkeeping.

    Args:
        redis_address: Redis address; it must split on ":" into exactly
            an ip and a port.
        autoscaling_config: Stored unchanged on ``self.autoscaling_config``.
        redis_password: Password handed to the Redis and GCS connections.
        prefix_cluster_info: Stored unchanged on
            ``self.prefix_cluster_info``.
        monitor_ip: When truthy, the metrics address is written to Redis
            and the Prometheus HTTP server is started.
        stop_event: Can be used to signal graceful exit from the monitor
            loop.
    """
    # Global state first, then the raw Redis client used below.
    ray.state.state._initialize_global_state(
        redis_address, redis_password=redis_password)
    redis_client = ray._private.services.create_redis_client(
        redis_address, password=redis_password)
    self.redis = redis_client

    if monitor_ip:
        metrics_address = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
        redis_client.set("AutoscalerMetricsAddress", metrics_address)

    # The two-name unpack raises for anything that is not "ip:port".
    head_node_ip, redis_port = redis_address.split(":")
    self.gcs_client = connect_to_gcs(head_node_ip, int(redis_port),
                                     redis_password)

    # gRPC stub used for getting all node resource usage.
    gcs_address = redis_client.get("GcsServerAddress").decode("utf-8")
    channel_options = (("grpc.enable_http_proxy", 0), )
    channel = grpc.insecure_channel(gcs_address, options=channel_options)
    self.gcs_node_resources_stub = (
        gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(channel))

    # _internal_kv needs the clients and the mode on the global worker.
    global_worker = ray.worker.global_worker
    global_worker.redis_client = redis_client
    global_worker.gcs_client = self.gcs_client
    global_worker.mode = 0

    # Plain values handed in by the caller.
    self.redis_address = redis_address
    self.redis_password = redis_password
    self.prefix_cluster_info = prefix_cluster_info
    self.stop_event = stop_event  # type: Optional[Event]
    self.autoscaling_config = autoscaling_config

    # Objects owned by the monitor itself.
    self.load_metrics = LoadMetrics(local_ip=head_node_ip)
    self.last_avail_resources = None
    self.event_summarizer = EventSummarizer()
    self.autoscaler = None
    self.prom_metrics = AutoscalerPrometheusMetrics()

    # Without a monitor_ip the metrics server is not started, which keeps
    # behavior identical to before metrics were introduced.
    if monitor_ip:
        try:
            start_message = (
                "Starting autoscaler metrics server on port {}".format(
                    AUTOSCALER_METRIC_PORT))
            logger.info(start_message)
            prometheus_client.start_http_server(
                AUTOSCALER_METRIC_PORT,
                registry=self.prom_metrics.registry)
        except Exception:
            logger.exception(
                "An exception occurred while starting the metrics server.")

    logger.info("Monitor: Started")
def __init__(self,
             provider,
             queue,
             pending,
             prom_metrics=None,
             node_types=None,
             index=None,
             *args,
             **kwargs):
    """Store the launcher's collaborators, then run the parent initializer.

    Args:
        provider: Stored on ``self.provider``.
        queue: Stored on ``self.queue``.
        pending: Stored on ``self.pending``.
        prom_metrics: Metrics object to use; a new
            ``AutoscalerPrometheusMetrics`` is created when this is falsy.
        node_types: Stored on ``self.node_types``.
        index: Kept as its string form, or "" when it is None.
        *args: Forwarded to the parent class initializer.
        **kwargs: Forwarded to the parent class initializer.
    """
    self.queue = queue
    self.pending = pending

    if not prom_metrics:
        prom_metrics = AutoscalerPrometheusMetrics()
    self.prom_metrics = prom_metrics

    self.provider = provider
    self.node_types = node_types

    if index is None:
        self.index = ""
    else:
        self.index = str(index)

    super(NodeLauncher, self).__init__(*args, **kwargs)
def __init__(
        self,
        provider,
        pending,
        event_summarizer,
        prom_metrics=None,
        node_types=None,
        index=None,
        *args,
        **kwargs,
):
    """Store the collaborators this node launcher works with.

    Args:
        provider: Stored on ``self.provider``.
        pending: Stored on ``self.pending``.
        event_summarizer: Stored on ``self.event_summarizer``.
        prom_metrics: Metrics object to use; a new
            ``AutoscalerPrometheusMetrics`` is created when this is falsy.
        node_types: Stored on ``self.node_types``.
        index: Kept as its string form, or "" when it is None.
        *args: Accepted but not used by this initializer.
        **kwargs: Accepted but not used by this initializer.
    """
    self.pending = pending

    if not prom_metrics:
        prom_metrics = AutoscalerPrometheusMetrics()
    self.prom_metrics = prom_metrics

    self.provider = provider
    self.node_types = node_types

    if index is None:
        self.index = ""
    else:
        self.index = str(index)

    self.event_summarizer = event_summarizer
def __init__(self,
             redis_address,
             autoscaling_config,
             redis_password=None,
             prefix_cluster_info=False,
             monitor_ip=None,
             stop_event: Optional[Event] = None):
    """Connect the monitor to Redis and the GCS and create its bookkeeping.

    Args:
        redis_address: Redis address; it must split on ":" into exactly
            an ip and a port.
        autoscaling_config: Stored unchanged on ``self.autoscaling_config``.
        redis_password: Password handed to the Redis connections.
        prefix_cluster_info: Stored unchanged on
            ``self.prefix_cluster_info``.
        monitor_ip: When truthy, the metrics address is written to Redis
            and, if ``prometheus_client`` is available, the metrics HTTP
            server is started.
        stop_event: Can be used to signal graceful exit from the monitor
            loop.
    """
    # Global state first, then the raw Redis client used below.
    ray.state.state._initialize_global_state(
        redis_address, redis_password=redis_password)
    redis_client = ray._private.services.create_redis_client(
        redis_address, password=redis_password)
    self.redis = redis_client

    if monitor_ip:
        metrics_address = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
        redis_client.set("AutoscalerMetricsAddress", metrics_address)

    # The two-name unpack raises for anything that is not "ip:port";
    # only the ip part is needed afterwards.
    head_node_ip, _redis_port = redis_address.split(":")

    # gRPC stubs used for getting node resource usage and node info.
    gcs_address = redis_client.get("GcsServerAddress").decode("utf-8")
    channel_options = (("grpc.enable_http_proxy", 0), )
    channel = ray._private.utils.init_grpc_channel(gcs_address,
                                                   channel_options)
    # TODO: Use gcs client for this
    self.gcs_node_resources_stub = (
        gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(channel))
    self.gcs_node_info_stub = (
        gcs_service_pb2_grpc.NodeInfoGcsServiceStub(channel))

    # _internal_kv needs the redis client and the mode on the global
    # worker, plus an initialized GCS-backed KV.
    global_worker = ray.worker.global_worker
    global_worker.redis_client = redis_client
    kv_client = GcsClient.create_from_redis(redis_client)
    _initialize_internal_kv(kv_client)
    global_worker.mode = 0

    self.redis_address = redis_address
    self.redis_password = redis_password

    # With RAY_FAKE_CLUSTER set, the fake head node id stands in for the
    # head node ip.
    if os.environ.get("RAY_FAKE_CLUSTER"):
        metrics_local_ip = FAKE_HEAD_NODE_ID
    else:
        metrics_local_ip = head_node_ip
    self.load_metrics = LoadMetrics(local_ip=metrics_local_ip)

    self.last_avail_resources = None
    self.event_summarizer = EventSummarizer()
    self.prefix_cluster_info = prefix_cluster_info
    self.stop_event = stop_event  # type: Optional[Event]
    self.autoscaling_config = autoscaling_config
    self.autoscaler = None
    # If set, we are in a manually created cluster (non-autoscaling) and
    # simply mirroring what the GCS tells us the cluster node types are.
    self.readonly_config = None
    self.prom_metrics = AutoscalerPrometheusMetrics()

    if not prometheus_client:
        logger.warning("`prometheus_client` not found, so metrics will "
                       "not be exported.")
    elif monitor_ip:
        # Without a monitor_ip the metrics server is not started, which
        # keeps behavior identical to before metrics were introduced.
        try:
            start_message = (
                "Starting autoscaler metrics server on port {}".format(
                    AUTOSCALER_METRIC_PORT))
            logger.info(start_message)
            if head_node_ip == "127.0.0.1":
                bind_address = "127.0.0.1"
            else:
                bind_address = ""
            prometheus_client.start_http_server(
                port=AUTOSCALER_METRIC_PORT,
                addr=bind_address,
                registry=self.prom_metrics.registry)
        except Exception:
            logger.exception(
                "An exception occurred while starting the metrics server.")

    logger.info("Monitor: Started")
def __init__(
        self,
        config_path: str,
        load_metrics: LoadMetrics,
        max_launch_batch: int = AUTOSCALER_MAX_LAUNCH_BATCH,
        max_concurrent_launches: int = AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
        max_failures: int = AUTOSCALER_MAX_NUM_FAILURES,
        process_runner: Any = subprocess,
        update_interval_s: int = AUTOSCALER_UPDATE_INTERVAL_S,
        prefix_cluster_info: bool = False,
        event_summarizer: Optional[EventSummarizer] = None,
        prom_metrics: Optional[AutoscalerPrometheusMetrics] = None):
    """Build a StandardAutoscaler and start its node launchers.

    Args:
        config_path: Path to a Ray Autoscaler YAML.
        load_metrics: Provides metrics for the Ray cluster.
        max_launch_batch: Max number of nodes to launch in one request.
        max_concurrent_launches: Max number of nodes that can be
            concurrently launched. Together with `max_launch_batch` it
            determines how many node launchers are started.
        max_failures: Number of failures that the autoscaler will
            tolerate before exiting.
        process_runner: Subprocess-like interface used by the
            CommandRunner.
        update_interval_s: Seconds between running the autoscaling loop.
        prefix_cluster_info: Whether to add the cluster name to info
            strings.
        event_summarizer: Utility to consolidate duplicated messages; a
            new EventSummarizer is created when omitted.
        prom_metrics: Prometheus metrics for autoscaler-related
            operations; a new AutoscalerPrometheusMetrics is created
            when omitted.
    """
    self.config_path = config_path
    # When True, each line of the info string is prefixed with the
    # cluster name.
    self.prefix_cluster_info = prefix_cluster_info

    # The assignments below must stay ahead of self.reset():
    #  * self.provider needs to be created exactly once;
    #  * prom_metrics must already be instantiated so that an exception
    #    raised inside reset can increment the exception counter.
    self.provider = None
    if not prom_metrics:
        prom_metrics = AutoscalerPrometheusMetrics()
    self.prom_metrics = prom_metrics
    self.resource_demand_scheduler = None
    self.reset(errors_fatal=True)

    self.load_metrics = load_metrics
    self.head_node_ip = load_metrics.local_ip
    self.max_failures = max_failures
    self.max_launch_batch = max_launch_batch
    self.max_concurrent_launches = max_concurrent_launches
    self.process_runner = process_runner
    if not event_summarizer:
        event_summarizer = EventSummarizer()
    self.event_summarizer = event_summarizer

    # node_id -> NodeUpdater thread
    self.updaters = {}
    self.num_failed_updates = defaultdict(int)
    self.num_successful_updates = defaultdict(int)
    self.num_failures = 0
    self.last_update_time = 0.0
    self.update_interval_s = update_interval_s

    # Active worker nodes.
    self.workers = []
    # Nodes scheduled for termination.
    self.nodes_to_terminate = []

    # Disable NodeUpdater threads if true.
    # Should be set to true in situations where another component, such
    # as a Kubernetes operator, is responsible for Ray setup on nodes.
    provider_config = self.config["provider"]
    self.disable_node_updaters = provider_config.get(
        "disable_node_updaters", False)

    # Node launchers: one per batch of concurrent launches, all sharing
    # the same queue and pending counter.
    self.launch_queue = queue.Queue()
    self.pending_launches = ConcurrentCounter()
    launcher_count = int(
        math.ceil(max_concurrent_launches / float(max_launch_batch)))
    for launcher_index in range(launcher_count):
        launcher = NodeLauncher(
            provider=self.provider,
            queue=self.launch_queue,
            index=launcher_index,
            pending=self.pending_launches,
            node_types=self.available_node_types,
            prom_metrics=self.prom_metrics)
        launcher.daemon = True
        launcher.start()

    # NodeTracker maintains soft state to track the number of recently
    # failed nodes. It is best effort only.
    self.node_tracker = NodeTracker()

    # Expand local file_mounts to allow ~ in the paths. This can't be
    # done earlier when the config is written since we might be on a
    # different platform and the expansion would result in a wrong path.
    expanded_mounts = {}
    for remote_path, local_path in self.config["file_mounts"].items():
        expanded_mounts[remote_path] = os.path.expanduser(local_path)
    self.config["file_mounts"] = expanded_mounts

    for expanded_path in self.config["file_mounts"].values():
        assert os.path.exists(expanded_path)

    logger.info("StandardAutoscaler: {}".format(self.config))
def __init__(
        self,
        address: str,
        autoscaling_config: Union[str, Callable[[], Dict[str, Any]]],
        redis_password: Optional[str] = None,
        prefix_cluster_info: bool = False,
        monitor_ip: Optional[str] = None,
        stop_event: Optional[Event] = None,
        retry_on_failure: bool = True,
):
    """Connect the monitor to the GCS and create its bookkeeping.

    Args:
        address: GCS address. The part before the first ":" is used as
            the head node ip when choosing the metrics bind address.
        autoscaling_config: Stored unchanged on ``self.autoscaling_config``.
        redis_password: Deprecated. A warning is logged when it is not
            None; the value is otherwise ignored.
        prefix_cluster_info: Stored unchanged on
            ``self.prefix_cluster_info``.
        monitor_ip: When truthy, the metrics address is written to the
            GCS internal KV and, if ``prometheus_client`` is available,
            the metrics HTTP server is started.
        stop_event: Can be used to signal graceful exit from the monitor
            loop.
        retry_on_failure: Stored unchanged on ``self.retry_on_failure``.
    """
    gcs_address = address
    options = (("grpc.enable_http_proxy", 0), )
    gcs_channel = ray._private.utils.init_grpc_channel(
        gcs_address, options)
    # TODO: Use gcs client for this
    self.gcs_node_resources_stub = (
        gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel))
    self.gcs_node_info_stub = gcs_service_pb2_grpc.NodeInfoGcsServiceStub(
        gcs_channel)

    if redis_password is not None:
        logger.warning("redis_password has been deprecated.")

    # Set up the GCS client and worker mode so _internal_kv works for
    # the autoscaler.
    worker = ray.worker.global_worker
    gcs_client = GcsClient(address=gcs_address)

    # Publish the metrics address exactly once. The write was previously
    # repeated verbatim after _initialize_internal_kv, which only cost an
    # extra overwrite of the same key with the same value.
    if monitor_ip:
        monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
        gcs_client.internal_kv_put(b"AutoscalerMetricsAddress",
                                   monitor_addr.encode(), True, None)
    _initialize_internal_kv(gcs_client)
    worker.mode = 0

    head_node_ip = gcs_address.split(":")[0]

    self.load_metrics = LoadMetrics()
    self.last_avail_resources = None
    self.event_summarizer = EventSummarizer()
    self.prefix_cluster_info = prefix_cluster_info
    # Can be used to signal graceful exit from monitor loop.
    self.stop_event = stop_event  # type: Optional[Event]
    self.retry_on_failure = retry_on_failure
    self.autoscaling_config = autoscaling_config
    self.autoscaler = None
    # If set, we are in a manually created cluster (non-autoscaling) and
    # simply mirroring what the GCS tells us the cluster node types are.
    self.readonly_config = None

    self.prom_metrics = AutoscalerPrometheusMetrics()
    if monitor_ip and prometheus_client:
        # If monitor_ip wasn't passed in, then don't attempt to start the
        # metric server to keep behavior identical to before metrics were
        # introduced
        try:
            logger.info(
                "Starting autoscaler metrics server on port {}".format(
                    AUTOSCALER_METRIC_PORT))
            prometheus_client.start_http_server(
                port=AUTOSCALER_METRIC_PORT,
                # Bind to loopback only when the head node itself is on
                # loopback; otherwise pass an empty address.
                addr="127.0.0.1" if head_node_ip == "127.0.0.1" else "",
                registry=self.prom_metrics.registry,
            )
        except Exception:
            logger.exception(
                "An exception occurred while starting the metrics server.")
    elif not prometheus_client:
        logger.warning(
            "`prometheus_client` not found, so metrics will not be exported."
        )

    logger.info("Monitor: Started")
def __init__(self,
             config_path,
             load_metrics,
             max_launch_batch=AUTOSCALER_MAX_LAUNCH_BATCH,
             max_concurrent_launches=AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
             max_failures=AUTOSCALER_MAX_NUM_FAILURES,
             process_runner=subprocess,
             update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S,
             prefix_cluster_info=False,
             event_summarizer=None,
             prom_metrics=None):
    """Build the autoscaler state and start its node launchers.

    Args:
        config_path: Stored on ``self.config_path``.
        load_metrics: Stored on ``self.load_metrics``; its ``local_ip``
            becomes ``self.head_node_ip``.
        max_launch_batch: Stored as-is. Together with
            `max_concurrent_launches` it determines how many node
            launchers are started.
        max_concurrent_launches: Stored as-is; see `max_launch_batch`.
        max_failures: Stored on ``self.max_failures``.
        process_runner: Stored on ``self.process_runner``.
        update_interval_s: Stored on ``self.update_interval_s``.
        prefix_cluster_info: Prefix each line of info string with the
            cluster name if True.
        event_summarizer: A new EventSummarizer is created when this is
            falsy.
        prom_metrics: A new AutoscalerPrometheusMetrics is created when
            this is falsy.
    """
    self.config_path = config_path
    # When True, each line of the info string is prefixed with the
    # cluster name.
    self.prefix_cluster_info = prefix_cluster_info

    # The assignments below must stay ahead of self.reset():
    #  * self.provider needs to be created exactly once;
    #  * prom_metrics must already be instantiated so that an exception
    #    raised inside reset can increment the exception counter.
    self.provider = None
    if not prom_metrics:
        prom_metrics = AutoscalerPrometheusMetrics()
    self.prom_metrics = prom_metrics
    self.resource_demand_scheduler = None
    self.reset(errors_fatal=True)

    self.load_metrics = load_metrics
    self.head_node_ip = load_metrics.local_ip
    self.max_failures = max_failures
    self.max_launch_batch = max_launch_batch
    self.max_concurrent_launches = max_concurrent_launches
    self.process_runner = process_runner
    if not event_summarizer:
        event_summarizer = EventSummarizer()
    self.event_summarizer = event_summarizer

    # node_id -> NodeUpdater process
    self.updaters = {}
    self.num_failed_updates = defaultdict(int)
    self.num_successful_updates = defaultdict(int)
    self.num_failures = 0
    self.last_update_time = 0.0
    self.update_interval_s = update_interval_s

    # Node launchers: one per batch of concurrent launches, all sharing
    # the same queue and pending counter.
    self.launch_queue = queue.Queue()
    self.pending_launches = ConcurrentCounter()
    launcher_count = int(
        math.ceil(max_concurrent_launches / float(max_launch_batch)))
    for launcher_index in range(launcher_count):
        launcher = NodeLauncher(
            provider=self.provider,
            queue=self.launch_queue,
            index=launcher_index,
            pending=self.pending_launches,
            node_types=self.available_node_types,
            prom_metrics=self.prom_metrics)
        launcher.daemon = True
        launcher.start()

    # NodeTracker maintains soft state to track the number of recently
    # failed nodes. It is best effort only.
    self.node_tracker = NodeTracker()

    # Expand local file_mounts to allow ~ in the paths. This can't be
    # done earlier when the config is written since we might be on a
    # different platform and the expansion would result in a wrong path.
    expanded_mounts = {}
    for remote_path, local_path in self.config["file_mounts"].items():
        expanded_mounts[remote_path] = os.path.expanduser(local_path)
    self.config["file_mounts"] = expanded_mounts

    for expanded_path in self.config["file_mounts"].values():
        assert os.path.exists(expanded_path)

    logger.info("StandardAutoscaler: {}".format(self.config))