def test_ray_debugger_public_multi_node(shutdown_only, ray_debugger_external):
    """Check that breakpoints on the head and worker node are both reachable.

    One task is pinned to each node of a two-node cluster. Each task stops
    in ``ray.util.pdb.set_trace()``; the test reads the advertised debugger
    addresses from the internal KV store, validates the host part, and
    sends ``c`` over Telnet to both sessions so the tasks can return.
    """
    head_node_args = {
        "num_cpus": 0,
        "num_gpus": 1,
        "ray_debugger_external": ray_debugger_external,
    }
    c = Cluster(
        initialize_head=True, connect=True, head_node_args=head_node_args)
    c.add_node(num_cpus=1, ray_debugger_external=ray_debugger_external)

    @ray.remote
    def f():
        ray.util.pdb.set_trace()
        return 1

    # num_gpus=1 forces the task onto the head node.
    head_node_result = f.options(num_cpus=0, num_gpus=1).remote()
    # num_cpus=1 forces the task onto the worker node.
    worker_node_result = f.options(num_cpus=1).remote()

    kv = ray.experimental.internal_kv
    pdb_namespace = ray_constants.KV_NAMESPACE_PDB

    def list_sessions():
        return kv._internal_kv_list("RAY_PDB_", namespace=pdb_namespace)

    wait_for_condition(lambda: len(list_sessions()) == 2)

    active_sessions = list_sessions()
    assert len(active_sessions) == 2

    # Fetch both session records before validating either of them.
    sessions = [
        json.loads(kv._internal_kv_get(key, namespace=pdb_namespace))
        for key in active_sessions
    ]

    endpoints = []
    for session in sessions:
        host, port = session["pdb_address"].split(":")
        if ray_debugger_external:
            assert host == services.get_node_ip_address(), host
        else:
            assert host == "localhost", host
        endpoints.append((host, port))

    # Check that we can successfully connect to both breakpoints.
    # The Telnet objects are kept referenced so that neither connection is
    # dropped before the tasks have returned.
    connections = []
    for host, port in endpoints:
        tn = Telnet(host, int(port))
        tn.write(b"c\n")
        connections.append(tn)

    # The messages above should cause these to return now.
    ray.get([head_node_result, worker_node_result])
def with_head_node_ip(cmds, head_ip=None):
    """Prefix every command with an export of ``RAY_HEAD_IP``.

    Args:
        cmds: Iterable of shell command strings.
        head_ip: Value exported as ``RAY_HEAD_IP``. When ``None``, the
            result of ``services.get_node_ip_address()`` is used.

    Returns:
        A new list with one prefixed command per input command.
    """
    if head_ip is None:
        head_ip = services.get_node_ip_address()
    return [
        "export RAY_HEAD_IP={}; {}".format(head_ip, cmd) for cmd in cmds
    ]
def run_kuberay_autoscaler(cluster_name: str, cluster_namespace: str):
    """Wait until the Ray head container is ready. Then start the autoscaler."""
    _setup_logging()
    head_ip = get_node_ip_address()
    ray_address = f"{head_ip}:6379"

    # Poll `ray health-check` until it exits successfully.
    head_ready = False
    while not head_ready:
        try:
            subprocess.check_call(
                ["ray", "health-check", "--address", ray_address])
        except subprocess.CalledProcessError:
            logger.warning("The Ray head is not yet ready.")
            logger.warning(f"Will check again in {BACKOFF_S} seconds.")
            time.sleep(BACKOFF_S)
        else:
            logger.info("The Ray head is ready. Starting the autoscaler.")
            head_ready = True

    # autoscaling_config_producer reads the RayCluster CR from K8s and uses
    # the CR to output an autoscaling config.
    autoscaling_config_producer = AutoscalingConfigProducer(
        cluster_name, cluster_namespace)

    monitor = Monitor(
        address=ray_address,
        # The `autoscaling_config` arg can be a dict or a
        # `Callable: () -> dict`. In this case, it's a callable.
        autoscaling_config=autoscaling_config_producer,
        monitor_ip=head_ip,
    )
    monitor.run()
def __init__(self, logs_dir, gcs_address):
    """Initialize the log monitor object.

    Args:
        logs_dir: Stored as ``self.logs_dir``.
        gcs_address: Address passed to ``gcs_pubsub.GcsPublisher``.
    """
    # External lookups first: this node's IP, then the GCS publisher.
    self.ip = services.get_node_ip_address()
    self.publisher = gcs_pubsub.GcsPublisher(address=gcs_address)
    self.logs_dir = logs_dir

    # File-tracking state, all empty to begin with.
    self.log_filenames = set()
    self.open_file_infos, self.closed_file_infos = [], []
    self.can_open_more_files = True
def __init__(self, local_ip=None):
    """Set up empty per-IP bookkeeping tables and record the local IP.

    Args:
        local_ip: IP to store as ``self.local_ip``. When ``None``, the
            value returned by ``services.get_node_ip_address()`` is used.
    """
    # Per-IP tables; each attribute gets its own fresh dict.
    self.last_used_time_by_ip = {}
    self.last_heartbeat_time_by_ip = {}
    self.static_resources_by_ip = {}
    self.dynamic_resources_by_ip = {}
    self.resource_load_by_ip = {}

    # Only query the node IP when the caller did not supply one.
    if local_ip is None:
        self.local_ip = services.get_node_ip_address()
    else:
        self.local_ip = local_ip

    self.waiting_bundles = []
    self.infeasible_bundles = []
def _start_ray_node(self, command, tag):
    """Run ``command`` via ``session_execute`` and return its process info.

    The PIDs of the launched processes are registered with ``JVMGuard`` and
    the returned object is tagged with this node's IP as ``node_ip``.
    """
    env = self._prepare_env()
    banner = "Starting {} by running: {}".format(tag, command)
    print(banner)

    info = session_execute(command=command, env=env, tag=tag)
    JVMGuard.register_pids(info.pids)

    # Imported here, after the launch, exactly where the lookup is needed.
    import ray._private.services as rservices
    info.node_ip = rservices.get_node_ip_address()
    return info
def __init__(self, logs_dir, redis_address, redis_password=None):
    """Initialize the log monitor object.

    Args:
        logs_dir: Stored as ``self.logs_dir``.
        redis_address: Address handed to ``create_redis_client``.
        redis_password: Optional password handed to ``create_redis_client``.
    """
    # External lookups first: this node's IP, then the Redis client.
    self.ip = services.get_node_ip_address()
    self.redis_client = ray._private.services.create_redis_client(
        redis_address,
        password=redis_password,
    )
    self.logs_dir = logs_dir

    # File-tracking state, all empty to begin with.
    self.log_filenames = set()
    self.open_file_infos, self.closed_file_infos = [], []
    self.can_open_more_files = True
def _start_ray_node(self, command, tag):
    """Run ``command`` via ``session_execute`` and return its process info.

    After the launch, ``RayServiceFuncGenerator.start_ray_daemon`` is called
    with the Spark executor PID to watch and the new process group ID to
    kill. The returned object is tagged with this node's IP as ``node_ip``.
    """
    env = self._prepare_env()
    banner = "Starting {} by running: {}".format(tag, command)
    print(banner)

    info = session_execute(command=command, env=env, tag=tag)

    executor_pid = RayServiceFuncGenerator._get_spark_executor_pid()
    RayServiceFuncGenerator.start_ray_daemon(
        self.python_loc,
        pid_to_watch=executor_pid,
        pgid_to_kill=info.pgid,
    )

    # Imported here, after the launch, exactly where the lookup is needed.
    import ray._private.services as rservices
    info.node_ip = rservices.get_node_ip_address()
    return info
def __init__(self, local_ip=None):
    """Set up empty bookkeeping state and record the local IP.

    Args:
        local_ip: IP to store as ``self.local_ip``. When ``None``, the
            value returned by ``services.get_node_ip_address()`` is used.
    """
    # Per-IP tables; each attribute gets its own fresh dict.
    self.last_used_time_by_ip = {}
    self.last_heartbeat_time_by_ip = {}
    self.static_resources_by_ip = {}
    self.dynamic_resources_by_ip = {}
    self.resource_load_by_ip = {}

    # Only query the node IP when the caller did not supply one.
    if local_ip is None:
        self.local_ip = services.get_node_ip_address()
    else:
        self.local_ip = local_ip

    # Demand-side state, all empty to begin with.
    self.waiting_bundles = []
    self.infeasible_bundles = []
    self.pending_placement_groups = []
    self.resource_requests = []
    self.cluster_full_of_actors_detected = False
async def _run_app(self):
    """Start the HTTP app on this node's IP, port 8080, and signal readiness.

    Registers a GET route for discovery and a PUT route for hint reports,
    starts a TCP site on the address from ``services.get_node_ip_address()``
    and then sets ``self._ready``.
    """
    port = 8080

    app = web.Application()
    routes = [
        web.get('/discover/{namespace}/{name}/{group}',
                self._handle_discover),
        web.put('/hints/{namespace}/{name}', self._handle_report),
    ]
    app.add_routes(routes)

    self._runner = web.AppRunner(app)
    await self._runner.setup()

    runner = self._runner
    host = services.get_node_ip_address()
    site = web.TCPSite(runner, host, port)
    await site.start()

    self._ready.set()
def __init__(self, logs_dir, redis_address, redis_password=None):
    """Initialize the log monitor object.

    Args:
        logs_dir: Stored as ``self.logs_dir``.
        redis_address: Address handed to ``create_redis_client``.
        redis_password: Optional password handed to ``create_redis_client``.
    """
    self.ip = services.get_node_ip_address()
    self.logs_dir = logs_dir
    self.redis_client = ray._private.services.create_redis_client(
        redis_address,
        password=redis_password,
    )

    # A GCS publisher is created only when GCS pubsub is enabled; its
    # address is looked up through the Redis client created above.
    if gcs_pubsub.gcs_pubsub_enabled():
        gcs_addr = gcs_utils.get_gcs_address_from_redis(self.redis_client)
        self.publisher = gcs_pubsub.GcsPublisher(address=gcs_addr)
    else:
        self.publisher = None

    # File-tracking state, all empty to begin with.
    self.log_filenames = set()
    self.open_file_infos, self.closed_file_infos = [], []
    self.can_open_more_files = True
def _shutdown_per_node(iter):
    """Send SIGTERM to the process groups that belong to this node.

    ``pgids``, ``node_ips`` and ``rservices`` come from the enclosing scope.
    When ``node_ips`` is non-empty, only the pgids paired with this node's
    IP are signalled; otherwise every pgid is. Failures to signal a group
    are reported with a warning and do not stop the loop. ``iter`` is not
    used by the body.
    """
    print("Stopping pgids: {}".format(pgids))

    if not node_ips:
        effect_pgids = pgids
    else:
        current_node_ip = rservices.get_node_ip_address()
        effect_pgids = [
            candidate
            for candidate, owner_ip in zip(pgids, node_ips)
            if owner_ip == current_node_ip
        ]

    for pgid in effect_pgids:
        print("Stopping by pgid {}".format(pgid))
        try:
            os.killpg(pgid, signal.SIGTERM)
        except Exception:
            # Best effort: the group may already be gone.
            print("WARNING: cannot kill pgid: {}".format(pgid))
def test_ray_debugger_public(shutdown_only, call_ray_stop_only,
                             ray_debugger_external):
    """Check that a breakpoint in a task is reachable on the advertised host.

    Starts a head node with ``ray start``, connects to it, runs a task that
    stops in ``ray.util.pdb.set_trace()``, validates the host part of the
    advertised debugger address, and sends ``c`` over Telnet so the task
    can return.
    """
    cmd = ["ray", "start", "--head", "--num-cpus=1"]
    if ray_debugger_external:
        cmd.append("--ray-debugger-external")
    out = ray._private.utils.decode(
        subprocess.check_output(cmd, stderr=subprocess.STDOUT))

    # Get the redis address from the output. Fail with the full output if
    # the marker is missing, rather than slicing out a meaningless address.
    redis_substring_prefix = "--address='"
    _, marker, remainder = out.partition(redis_substring_prefix)
    assert marker, out
    address = remainder.split("'")[0]

    ray.init(address=address)

    @ray.remote
    def f():
        ray.util.pdb.set_trace()
        return 1

    result = f.remote()

    wait_for_condition(lambda: len(
        ray.experimental.internal_kv._internal_kv_list(
            "RAY_PDB_", namespace=ray_constants.KV_NAMESPACE_PDB)) > 0)

    active_sessions = ray.experimental.internal_kv._internal_kv_list(
        "RAY_PDB_", namespace=ray_constants.KV_NAMESPACE_PDB)
    assert len(active_sessions) == 1
    session = json.loads(
        ray.experimental.internal_kv._internal_kv_get(
            active_sessions[0], namespace=ray_constants.KV_NAMESPACE_PDB))

    host, port = session["pdb_address"].split(":")
    if ray_debugger_external:
        assert host == services.get_node_ip_address(), host
    else:
        assert host == "localhost", host

    # Check that we can successfully connect to the breakpoint.
    tn = Telnet(host, int(port))
    tn.write(b"c\n")

    # The message above should cause this to return now.
    ray.get(result)
def _run_autoscaler(cluster_name: str,
                    cluster_namespace: str,
                    redis_password: str = ""):
    """Set up logging, then build and run a ``Monitor`` for this head node.

    The monitor is pointed at port 6379 on the IP returned by
    ``get_node_ip_address()`` and receives an ``AutoscalingConfigProducer``
    built from ``cluster_name`` and ``cluster_namespace`` as its config.
    """
    _setup_logging()
    head_ip = get_node_ip_address()

    autoscaling_config_producer = AutoscalingConfigProducer(
        cluster_name, cluster_namespace)

    monitor = Monitor(
        address=f"{head_ip}:6379",
        redis_password=redis_password,
        # The `autoscaling_config` arg can be a dict or a
        # `Callable: () -> dict`. In this case, it's a callable.
        autoscaling_config=autoscaling_config_producer,
        monitor_ip=head_ip,
    )
    monitor.run()
def _process_events(self):
    """Handle one event from the trial executor.

    A failed trial, if any, is processed first. Otherwise the next
    available trial is fetched (blocking) and dispatched according to
    whether it is restoring, saving, or reporting a regular result.
    """
    failed_trial = self.trial_executor.get_next_failed_trial()
    if failed_trial:
        error_msg = (
            "{} (IP: {}) detected as stale. This is likely because the "
            "node was lost").format(failed_trial, failed_trial.node_ip)
        logger.info(error_msg)
        with warn_if_slow("process_failed_trial"):
            self._process_trial_failure(failed_trial, error_msg=error_msg)
        return

    # TODO(ujvl): Consider combining get_next_available_trial and
    #  fetch_result functionality so that we don't timeout on fetch.
    trial = self.trial_executor.get_next_available_trial()  # blocking

    if trial.is_restoring:
        with warn_if_slow("process_trial_restore"):
            self._process_trial_restore(trial)
        with warn_if_slow("callbacks.on_trial_restore"):
            self._callbacks.on_trial_restore(
                iteration=self._iteration,
                trials=self._trials,
                trial=trial)
        return

    if not trial.is_saving:
        # Neither restoring nor saving: a regular trial result.
        with warn_if_slow("process_trial"):
            self._process_trial(trial)
        return

    with warn_if_slow("process_trial_save") as profile:
        self._process_trial_save(trial)
    with warn_if_slow("callbacks.on_trial_save"):
        self._callbacks.on_trial_save(
            iteration=self._iteration,
            trials=self._trials,
            trial=trial)

    if not (profile.too_slow and trial.sync_on_checkpoint):
        return

    # TODO(ujvl): Suggest using DurableTrainable once
    #  API has converged.
    msg = (
        "Consider turning off forced head-worker trial "
        "checkpoint syncs by setting sync_on_checkpoint=False"
        ". Note that this may result in faulty trial "
        "restoration if a failure occurs while the checkpoint "
        "is being synced from the worker to the head node.")

    # Warn once, and only when the trial's recorded hostname is set and
    # differs from this node's address.
    if (trial.location.hostname
            and trial.location.hostname != get_node_ip_address()
            and log_once("tune_head_worker_checkpoint")):
        logger.warning(msg)
def __init__(
    self,
    logs_dir,
    gcs_publisher: gcs_pubsub.GcsPublisher,
    is_proc_alive_fn: Callable[[int], bool],
    max_files_open: int = ray_constants.LOG_MONITOR_MAX_OPEN_FILES,
):
    """Initialize the log monitor object.

    Args:
        logs_dir: Stored as ``self.logs_dir``.
        gcs_publisher: Stored as ``self.publisher``.
        is_proc_alive_fn: Callable taking an int and returning a bool;
            stored as ``self.is_proc_alive_fn``.
        max_files_open: Stored as ``self.max_files_open``.
    """
    self.ip: str = services.get_node_ip_address()

    # Collaborators and limits supplied by the caller.
    self.logs_dir: str = logs_dir
    self.publisher = gcs_publisher
    self.is_proc_alive_fn: Callable[[int], bool] = is_proc_alive_fn
    self.max_files_open: int = max_files_open

    # File-tracking state, all empty to begin with.
    self.log_filenames: Set[str] = set()
    self.open_file_infos: List[LogFileInfo] = []
    self.closed_file_infos: List[LogFileInfo] = []
    self.can_open_more_files: bool = True
def run_kuberay_autoscaler(cluster_name: str, cluster_namespace: str):
    """Wait until the Ray head container is ready. Then start the autoscaler."""
    head_ip = get_node_ip_address()
    ray_address = f"{head_ip}:6379"

    head_ready = False
    while not head_ready:
        try:
            # Autoscaler Ray version might not exactly match GCS version,
            # so skip the version check when checking GCS status.
            subprocess.check_call(
                [
                    "ray",
                    "health-check",
                    "--address",
                    ray_address,
                    "--skip-version-check",
                ]
            )
        except subprocess.CalledProcessError:
            print("The Ray head is not yet ready.")
            print(f"Will check again in {BACKOFF_S} seconds.")
            time.sleep(BACKOFF_S)
        else:
            # Logging is not ready yet. Print to stdout for now.
            print("The Ray head is ready. Starting the autoscaler.")
            head_ready = True

    # The Ray head container sets up the log directory. Thus, we set up
    # logging only after the Ray head is ready.
    _setup_logging()

    # autoscaling_config_producer reads the RayCluster CR from K8s and uses
    # the CR to output an autoscaling config.
    autoscaling_config_producer = AutoscalingConfigProducer(
        cluster_name, cluster_namespace
    )

    monitor = Monitor(
        address=ray_address,
        # The `autoscaling_config` arg can be a dict or a
        # `Callable: () -> dict`. In this case, it's a callable.
        autoscaling_config=autoscaling_config_producer,
        monitor_ip=head_ip,
        # Let the autoscaler process exit after it hits 5 exceptions.
        # (See ray.autoscaler._private.constants.AUTOSCALER_MAX_NUM_FAILURES.)
        # Kubernetes will then restart the autoscaler container.
        retry_on_failure=False,
    )
    monitor.run()
def listen_for_spot_termination(timeout=None):
    """Poll the spot instance-action endpoint until termination or timeout.

    When the ``MOCK`` environment variable is ``"true"`` (case-insensitive)
    the endpoint is port 8234 on this node's IP; otherwise it is
    ``169.254.169.254``.

    Args:
        timeout: Optional number of seconds after which to give up. A falsy
            value means poll indefinitely.

    Returns:
        This node's IP address when the endpoint reports a ``terminate`` or
        ``stop`` action, or ``None`` once ``timeout`` has elapsed.

    Raises:
        RuntimeError: If the endpoint answers with a status that is neither
            404 nor 2xx.
    """
    mock = os.environ.get("MOCK", "False").lower() == "true"

    logging.basicConfig(level=logging.INFO)

    if mock:
        logging.debug("Using mocked spot instance")
        endpoint = f"{services.get_node_ip_address()}:8234"
    else:
        # AWS spot instance termination endpoint
        endpoint = "169.254.169.254"

    start = time.time()
    while True:
        try:
            resp = requests.get(
                f'http://{endpoint}/latest/meta-data/spot/instance-action',
                timeout=0.1)
            if resp.status_code == 404:
                # AWS endpoint responded, no termination detected
                time.sleep(5)
            elif 200 <= resp.status_code < 300:
                resp_json = resp.json()
                if resp_json["action"] in ("terminate", "stop"):
                    ip = services.get_node_ip_address()
                    logging.info(f"termination detected on node {ip}")
                    return ip
                # NOTE(review): a 2xx answer with any other action loops
                # again without sleeping, as before — confirm intended.
            else:
                raise RuntimeError(
                    "AWS spot instance interrupt warning "
                    "endpoint not responding")
        except requests.RequestException as e:
            logging.error(e)
            time.sleep(5)

        # Checked after the try/except so the deadline is also honored
        # when the request itself keeps failing (e.g. unreachable endpoint).
        if timeout and time.time() - start > timeout:
            return None
def run_adaptdl(job_key, job_uid, rank, replicas, num_restarts,
                checkpoint=None, offset=0, path="", argv=None):
    """Prepare the AdaptDL environment and execute the script at ``path``.

    The function publishes the ``ADAPTDL_*`` environment variables,
    recreates a per-worker checkpoint directory, registers this worker with
    the ``AdaptDLController`` actor, and then runs ``path`` as ``__main__``.

    Args:
        job_key: Exported as ``ADAPTDL_JOB_ID``.
        job_uid: Used with ``rank`` to build the checkpoint/share dir names.
        rank: Exported as ``ADAPTDL_REPLICA_RANK``; rank 0 additionally
            registers the checkpoint and reports success.
        replicas: Exported as ``ADAPTDL_NUM_REPLICAS``.
        num_restarts: Exported as ``ADAPTDL_NUM_RESTARTS`` and added to the
            master port.
        checkpoint: Optional object written into the checkpoint directory
            via ``_checkpoint_obj_to_dir``.
        offset: Added to the master port.
        path: File executed as the ``__main__`` module.
        argv: Optional extra arguments appended to ``sys.argv``.

    Raises:
        Exception: Re-raises any exception from setup or from the executed
            script after reporting ``Status.FAILED`` to the controller.
    """
    logging.basicConfig(level=logging.INFO)
    logging.info(f"Starting worker {rank}")

    def report_status(status):
        # Ship the status value to the controller through the object store.
        status_obj_ref = ray.put(status.value)
        controller.register_status.remote(status_obj_ref)

    controller = ray.get_actor("AdaptDLController")
    supervisor_url = ray.get(controller.get_url.remote())

    # The master port is derived from a fixed base plus the restart count
    # and the caller-supplied offset.
    os.environ["ADAPTDL_MASTER_PORT"] = str(47000 + num_restarts + offset)
    os.environ["ADAPTDL_REPLICA_RANK"] = str(rank)
    os.environ["ADAPTDL_NUM_REPLICAS"] = str(replicas)
    os.environ["ADAPTDL_SUPERVISOR_URL"] = supervisor_url
    os.environ["ADAPTDL_JOB_ID"] = job_key
    os.environ["ADAPTDL_NUM_RESTARTS"] = str(num_restarts)
    os.environ["ADAPTDL_SCHED_VERSION"] = str(
        pkg_resources.get_distribution("adaptdl").version)

    suffix = f"{job_uid}-{rank}"
    checkpoint_path = f"/tmp/checkpoint-{suffix}"

    try:
        # Start from an empty checkpoint directory on every run.
        if os.path.exists(checkpoint_path):
            import shutil
            shutil.rmtree(checkpoint_path)
        os.mkdir(checkpoint_path)
        if checkpoint:
            _checkpoint_obj_to_dir(checkpoint_path, checkpoint)
        num_restarts = int(num_restarts)
        os.environ["ADAPTDL_CHECKPOINT_PATH"] = str(checkpoint_path)
        share_path = f"/tmp/share-{suffix}"
        if not os.path.exists(share_path):
            os.mkdir(share_path)
        os.environ["ADAPTDL_SHARE_PATH"] = str(share_path)

        # Tell the controller which rank runs on which node IP.
        rank_obj_ref = ray.put(rank)
        ip_obj_ref = ray.put(services.get_node_ip_address())
        controller.register_worker.remote(rank_obj_ref, ip_obj_ref)
    except Exception as e:
        logging.info(traceback.format_exc())
        time.sleep(5)
        report_status(Status.FAILED)
        raise e

    # TODO: replace with block
    try:
        filename = Path(path).name
        sys.argv = [filename]
        if argv:
            # Need to augment the argv to mimic that file being called
            sys.argv += argv
        # Load and execute the target file under the module name "__main__".
        spec = importlib.util.spec_from_file_location("__main__", path)
        module = importlib.util.module_from_spec(spec)
        # TODO: fix imports when caller module is not in the root path
        spec.loader.exec_module(module)
        time.sleep(5)
    except SystemExit:
        # Received a cancel from the controller -- the job is being rescheduled
        # Worker 0 needs to send the checkpoint back to the controller so the
        # next generation of workers can resume
        logging.info(f"Worker {rank} received system exit")
        if rank == 0:
            checkpoint_obj = _serialize_checkpoint(checkpoint_path)
            logging.info("checkpoint created")
            checkpoint_obj_ref = ray.put(checkpoint_obj)
            logging.info("checkpoint placed")
            result = ray.get(
                controller.register_checkpoint.remote(checkpoint_obj_ref))
            logging.info(f"checkpoint registered: {result}")
        # This sleep is to keep this remote task alive
        # until its worker object can be killed by the controller
        time.sleep(1800)
    except Exception as e:
        logging.error(traceback.format_exc())
        logging.error(e)
        time.sleep(5)
        report_status(Status.FAILED)
        raise e
    else:
        # The script finished without raising; only rank 0 reports success.
        if rank == 0:
            logging.info("Job succeeded, exiting")
            time.sleep(5)
            report_status(Status.SUCCEEDED)
            time.sleep(5)
def _invalid_nodes(self):
    """Return the terminating nodes together with this node's own IP."""
    own_ip = services.get_node_ip_address()
    excluded = {own_ip}
    return self._terminating_nodes.union(excluded)
def start(node_ip_address, address, port, redis_password, redis_shard_ports,
          object_manager_port, node_manager_port, gcs_server_port,
          min_worker_port, max_worker_port, worker_port_list, memory,
          object_store_memory, redis_max_memory, num_cpus, num_gpus,
          resources, head, include_dashboard, dashboard_host, dashboard_port,
          block, plasma_directory, autoscaling_config,
          no_redirect_worker_output, no_redirect_output,
          plasma_store_socket_name, raylet_socket_name, temp_dir,
          java_worker_options, load_code_from_local, code_search_path,
          system_config, lru_evict, enable_object_reconstruction,
          metrics_export_port, log_style, log_color, verbose):
    """Start Ray processes manually on the local machine."""
    # NOTE: the docstring above is kept verbatim because it may be surfaced
    # as command help text; the flow is documented with comments instead.
    #
    # Overview:
    #   1. Validate and normalize the arguments into a RayParams object.
    #   2. With `head` set, start a head node (which starts Redis);
    #      otherwise connect to the existing Redis at `address`.
    #   3. With `block` set, poll the node's subprocesses and exit with
    #      status 1 as soon as any of them has died.
    cli_logger.configure(log_style, log_color, verbose)

    if gcs_server_port and not head:
        raise ValueError(
            "gcs_server_port can be only assigned when you specify --head.")

    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)

    redis_address = None
    if address is not None:
        (redis_address, redis_address_ip,
         redis_address_port) = services.validate_redis_address(address)

    try:
        resources = json.loads(resources)
    except Exception:
        cli_logger.error("`{}` is not a valid JSON string.",
                         cf.bold("--resources"))
        # The example shown to the user must itself be valid JSON, so it
        # includes the opening brace.
        cli_logger.abort(
            "Valid values look like this: `{}`",
            cf.bold("--resources='{\"CustomResource3\": 1, "
                    "\"CustomResource2\": 2}'"))

        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        " --resources='{\"CustomResource1\": 3, "
                        "\"CustomReseource2\": 2}'")

    redirect_worker_output = None if not no_redirect_worker_output else True
    redirect_output = None if not no_redirect_output else True
    ray_params = ray.parameter.RayParams(
        node_ip_address=node_ip_address,
        min_worker_port=min_worker_port,
        max_worker_port=max_worker_port,
        worker_port_list=worker_port_list,
        object_manager_port=object_manager_port,
        node_manager_port=node_manager_port,
        gcs_server_port=gcs_server_port,
        memory=memory,
        object_store_memory=object_store_memory,
        redis_password=redis_password,
        redirect_worker_output=redirect_worker_output,
        redirect_output=redirect_output,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        resources=resources,
        plasma_directory=plasma_directory,
        huge_pages=False,
        plasma_store_socket_name=plasma_store_socket_name,
        raylet_socket_name=raylet_socket_name,
        temp_dir=temp_dir,
        include_dashboard=include_dashboard,
        dashboard_host=dashboard_host,
        dashboard_port=dashboard_port,
        java_worker_options=java_worker_options,
        load_code_from_local=load_code_from_local,
        code_search_path=code_search_path,
        _system_config=system_config,
        lru_evict=lru_evict,
        enable_object_reconstruction=enable_object_reconstruction,
        metrics_export_port=metrics_export_port)
    if head:
        # Use default if port is none, allocate an available port if port is 0
        if port is None:
            port = ray_constants.DEFAULT_PORT

        if port == 0:
            with socket() as s:
                s.bind(("", 0))
                port = s.getsockname()[1]

        num_redis_shards = None
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            num_redis_shards = len(redis_shard_ports)

        if redis_address is not None:
            cli_logger.abort(
                "`{}` starts a new Redis server, `{}` should not be set.",
                cf.bold("--head"), cf.bold("--address"))

            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address())
        cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)
        ray_params.update_if_absent(
            redis_port=port,
            redis_shard_ports=redis_shard_ports,
            redis_max_memory=redis_max_memory,
            num_redis_shards=num_redis_shards,
            redis_max_clients=None,
            autoscaling_config=autoscaling_config,
        )

        # Fail early when starting a new cluster when one is already running
        if address is None:
            # Use the effective node IP (user-supplied if given, otherwise
            # auto-detected) so the check matches the address the new
            # cluster would actually use.
            default_address = f"{ray_params.node_ip_address}:{port}"
            redis_addresses = services.find_redis_address(default_address)
            if len(redis_addresses) > 0:
                raise ConnectionError(
                    f"Ray is already running at {default_address}. "
                    f"Please specify a different port using the `--port`"
                    f" command to `ray start`.")

        node = ray.node.Node(
            ray_params, head=True, shutdown_at_exit=block, spawn_reaper=block)
        redis_address = node.redis_address

        # this is a noop if new-style is not set, so the old logger calls
        # are still in place
        cli_logger.newline()
        startup_msg = "Ray runtime started."
        cli_logger.success("-" * len(startup_msg))
        cli_logger.success(startup_msg)
        cli_logger.success("-" * len(startup_msg))
        cli_logger.newline()
        with cli_logger.group("Next steps"):
            cli_logger.print(
                "To connect to this Ray runtime from another node, run")
            cli_logger.print(
                cf.bold(" ray start --address='{}'{}"), redis_address,
                f" --redis-password='******'" if redis_password else "")
            cli_logger.newline()
            cli_logger.print("Alternatively, use the following Python code:")
            with cli_logger.indented():
                with cf.with_style("monokai") as c:
                    cli_logger.print("{} ray", c.magenta("import"))
                    cli_logger.print(
                        "ray{}init(address{}{}{})", c.magenta("."),
                        c.magenta("="), c.yellow("'auto'"),
                        ", _redis_password{}{}".format(
                            c.magenta("="),
                            c.yellow("'" + redis_password + "'"))
                        if redis_password else "")
            cli_logger.newline()
            cli_logger.print(
                cf.underlined("If connection fails, check your "
                              "firewall settings and "
                              "network configuration."))
            cli_logger.newline()
            cli_logger.print("To terminate the Ray runtime, run")
            cli_logger.print(cf.bold(" ray stop"))
    else:
        # Start Ray on a non-head node.
        if not (port is None):
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--port"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --port is not "
                            "allowed.")
        if redis_shard_ports is not None:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--redis-shard-ports"),
                             cf.bold("--head"))

            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed.")
        if redis_address is None:
            cli_logger.abort("`{}` is required unless starting with `{}`.",
                             cf.bold("--address"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --address must "
                            "be provided.")
        if include_dashboard:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--include-dashboard"),
                             cf.bold("--head"))

            raise ValueError(
                "If --head is not passed in, the --include-dashboard"
                "flag is not relevant.")

        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(
            redis_address_ip, redis_address_port, password=redis_password)

        # Create a Redis client.
        redis_client = services.create_redis_client(
            redis_address, password=redis_password)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address(redis_address))

        cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)

        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(ray_params.node_ip_address,
                                        redis_client)
        ray_params.update(redis_address=redis_address)
        node = ray.node.Node(
            ray_params, head=False, shutdown_at_exit=block, spawn_reaper=block)

        cli_logger.newline()
        startup_msg = "Ray runtime started."
        cli_logger.success("-" * len(startup_msg))
        cli_logger.success(startup_msg)
        cli_logger.success("-" * len(startup_msg))
        cli_logger.newline()
        cli_logger.print("To terminate the Ray runtime, run")
        cli_logger.print(cf.bold(" ray stop"))

    if block:
        cli_logger.newline()
        with cli_logger.group(cf.bold("--block")):
            cli_logger.print(
                "This command will now block until terminated by a signal.")
            cli_logger.print(
                "Runing subprocesses are monitored and a message will be "
                "printed if any of them terminate unexpectedly.")

        # Poll once per second until at least one subprocess has died.
        while True:
            time.sleep(1)
            deceased = node.dead_processes()
            if len(deceased) > 0:
                cli_logger.newline()
                cli_logger.error("Some Ray subprcesses exited unexpectedly:")

                with cli_logger.indented():
                    for process_type, process in deceased:
                        cli_logger.error(
                            "{}",
                            cf.bold(str(process_type)),
                            _tags={"exit code": str(process.returncode)})

                # shutdown_at_exit will handle cleanup.
                cli_logger.newline()
                cli_logger.error("Remaining processes will be killed.")
                sys.exit(1)
# NOTE(review): the statements up to the `__main__` guard are the tail of a
# definition whose header lies above this chunk; they are kept verbatim.
root_logger.setLevel(logging.INFO)
root_handler = logging.StreamHandler()
root_handler.setLevel(logging.INFO)
root_handler.setFormatter(logging.Formatter(ray_constants.LOGGER_FORMAT))
root_logger.addHandler(root_handler)


if __name__ == "__main__":
    setup_logging()

    parser = argparse.ArgumentParser(description="Kuberay Autoscaler")
    parser.add_argument(
        "--redis-password",
        required=False,
        type=str,
        default=None,
        help="The password to use for Redis")
    args = parser.parse_args()

    # Read the cluster name from the autoscaling config; the context manager
    # closes the file handle as soon as the contents have been parsed.
    with open(AUTOSCALING_CONFIG_PATH) as config_file:
        cluster_name = yaml.safe_load(config_file.read())["cluster_name"]

    head_ip = get_node_ip_address()
    Monitor(
        address=f"{head_ip}:6379",
        redis_password=args.redis_password,
        autoscaling_config=AUTOSCALING_CONFIG_PATH,
        monitor_ip=head_ip,
    ).run()