def _handle_failure(self, error):
    """Record and broadcast a fatal monitor-loop error.

    Logs the active exception, kills and destroys the autoscaler workers
    when an autoscaler exists and ``RAY_AUTOSCALER_FATESHARE_WORKERS`` is
    "1", stores the error message in the internal KV (when initialized)
    and publishes it to drivers.

    Args:
        error: Failure description; interpolated into the stored and
            published message.
    """
    logger.exception("Error in monitor loop")
    if self.autoscaler is not None:
        if os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "") == "1":
            self.autoscaler.kill_workers()
            # Take down autoscaler workers if necessary.
            self.destroy_autoscaler_workers()

    # Something went wrong, so push an error to all current and future
    # drivers.
    err_msg = f"The autoscaler failed with the following error:\n{error}"
    if _internal_kv_initialized():
        _internal_kv_put(DEBUG_AUTOSCALING_ERROR, err_msg, overwrite=True)

    redis_client = ray._private.services.create_redis_client(
        self.redis_address, password=self.redis_password
    )

    # NOTE(review): `args` is not defined in this function; presumably the
    # module-level parsed command-line arguments -- confirm.
    if args.gcs_address:
        gcs_publisher = GcsPublisher(address=args.gcs_address)
    elif gcs_pubsub_enabled():
        gcs_publisher = GcsPublisher(
            address=get_gcs_address_from_redis(redis_client)
        )
    else:
        gcs_publisher = None

    from ray._private.utils import publish_error_to_driver

    publish_error_to_driver(
        ray_constants.MONITOR_DIED_ERROR,
        err_msg,
        redis_client=redis_client,
        gcs_publisher=gcs_publisher,
    )
def upload_runtime_env_package_if_needed(job_config: JobConfig) -> None:
    """Push every runtime env package that is missing from the cluster.

    Each URI from ``job_config.get_runtime_env_uris()`` is checked with
    ``package_exists``. For a missing URI, a package is created at its
    local path from the ``working_dir``, ``py_modules`` and ``excludes``
    entries of the runtime env (unless that local file already exists),
    and the file is then pushed.

    Args:
        job_config (JobConfig): The job config of the driver.
    """
    assert _internal_kv_initialized()
    for uri in job_config.get_runtime_env_uris():
        if package_exists(uri):
            continue

        local_path = _get_local_path(uri)
        local_file = Path(local_path)
        working_dir = job_config.runtime_env.get("working_dir")
        py_modules = job_config.runtime_env.get("py_modules")
        # A missing or empty "excludes" entry becomes an empty list.
        excludes = job_config.runtime_env.get("excludes") or []
        logger.info(
            f"{uri} doesn't exist. Create new package with"
            f" {working_dir} and {py_modules}"
        )
        if not local_file.exists():
            create_project_package(
                working_dir, py_modules, excludes, local_path
            )
        # Push the data to remote storage
        pushed_bytes = push_package(uri, local_file)
        logger.info(f"{uri} has been pushed with {pushed_bytes} bytes")
def __init__(self, dashboard_agent):
    """Create the caches, plugin managers and URI caches for runtime envs.

    Args:
        dashboard_agent: The owning agent. Its ``runtime_env_dir``,
            ``logging_params`` and ``temp_dir`` attributes are read here.
    """
    super().__init__(dashboard_agent)
    env_dir = dashboard_agent.runtime_env_dir
    self._runtime_env_dir = env_dir
    self._logging_params = dashboard_agent.logging_params
    self._per_job_logger_cache = {}

    # Cache the results of creating envs to avoid repeatedly calling into
    # conda and other slow calls.
    self._env_cache: Dict[str, CreatedEnvResult] = {}
    # Maps a serialized runtime env to a lock that is used
    # to prevent multiple concurrent installs of the same env.
    self._env_locks: Dict[str, asyncio.Lock] = {}
    # Keeps track of the URIs contained within each env so we can
    # invalidate the env cache when a URI is deleted.
    # This is a temporary mechanism until we have per-URI caching.
    self._uris_to_envs: Dict[str, Set[str]] = defaultdict(set)

    # Initialize internal KV to be used by the working_dir setup code.
    # NOTE(review): self._dashboard_agent is presumably assigned by the
    # parent constructor -- confirm.
    _initialize_internal_kv(self._dashboard_agent.gcs_client)
    assert _internal_kv_initialized()

    # One manager per runtime env plugin.
    self._pip_manager = PipManager(env_dir)
    self._conda_manager = CondaManager(env_dir)
    self._py_modules_manager = PyModulesManager(env_dir)
    self._working_dir_manager = WorkingDirManager(env_dir)
    self._container_manager = ContainerManager(dashboard_agent.temp_dir)

    # Each URI cache is given its manager's delete_uri callback and a
    # size constant.
    self._working_dir_uri_cache = URICache(
        self._working_dir_manager.delete_uri,
        WORKING_DIR_CACHE_SIZE_BYTES,
    )
    self._py_modules_uri_cache = URICache(
        self._py_modules_manager.delete_uri,
        PY_MODULES_CACHE_SIZE_BYTES,
    )
    self._conda_uri_cache = URICache(
        self._conda_manager.delete_uri,
        CONDA_CACHE_SIZE_BYTES,
    )
    self._pip_uri_cache = URICache(
        self._pip_manager.delete_uri,
        PIP_CACHE_SIZE_BYTES,
    )
    self._logger = default_logger
def _run(self):
    """Run the monitor loop.

    Every iteration refreshes load metrics, resource requests and the
    event summary, lets the autoscaler (if present) update, writes a JSON
    status report to the internal KV when it is initialized, and sleeps
    for ``AUTOSCALER_UPDATE_INTERVAL_S``. The loop exits once
    ``stop_event`` is set.
    """
    while not (self.stop_event and self.stop_event.is_set()):
        self.update_load_metrics()
        self.update_resource_requests()
        self.update_event_summary()

        report = {
            "load_metrics_report": self.load_metrics.summary()._asdict(),
            "time": time.time(),
            "monitor_pid": os.getpid(),
        }

        # Process autoscaling actions
        if self.autoscaler:
            # Only used to update the load metrics for the autoscaler.
            self.autoscaler.update()
            report["autoscaler_report"] = self.autoscaler.summary()._asdict()

            for event_msg in self.event_summarizer.summary():
                logger.info(
                    f"{ray_constants.LOG_PREFIX_EVENT_SUMMARY}{event_msg}"
                )
            self.event_summarizer.clear()

        serialized = json.dumps(report)
        if _internal_kv_initialized():
            _internal_kv_put(
                DEBUG_AUTOSCALING_STATUS, serialized, overwrite=True
            )

        # Wait for an autoscaler update interval before processing the next
        # round of messages.
        time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
def _handle_failure(self, error):
    """Log a fatal monitor error, store it in the KV and notify drivers.

    When an autoscaler exists and ``RAY_AUTOSCALER_FATESHARE_WORKERS`` is
    "1", its workers are killed and destroyed first.

    Args:
        error: Failure description; interpolated into the stored and
            published message.
    """
    logger.exception("Error in monitor loop")

    if self.autoscaler is not None:
        if os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "") == "1":
            self.autoscaler.kill_workers()
            # Take down autoscaler workers if necessary.
            self.destroy_autoscaler_workers()

    # Something went wrong, so push an error to all current and future
    # drivers.
    failure_text = f"The autoscaler failed with the following error:\n{error}"
    if _internal_kv_initialized():
        _internal_kv_put(
            ray_constants.DEBUG_AUTOSCALING_ERROR,
            failure_text,
            overwrite=True,
        )

    # NOTE(review): `args` is not defined in this function; presumably the
    # module-level parsed command-line arguments -- confirm.
    publisher = GcsPublisher(address=args.gcs_address)

    from ray._private.utils import publish_error_to_driver

    publish_error_to_driver(
        ray_constants.MONITOR_DIED_ERROR,
        failure_text,
        gcs_publisher=publisher,
    )
def _run(self):
    """Run the monitor loop.

    Loops forever: refreshes the raylet map, load metrics and resource
    requests, lets the autoscaler (if present) update, writes a JSON
    status report to the internal KV when it is initialized, then sleeps
    for ``AUTOSCALER_UPDATE_INTERVAL_S``.
    """
    while True:
        self.update_raylet_map()
        self.update_load_metrics()
        self.update_resource_requests()

        report = {}
        report["load_metrics_report"] = self.load_metrics.summary()._asdict()

        # Process autoscaling actions
        if self.autoscaler:
            # Only used to update the load metrics for the autoscaler.
            self.autoscaler.update()
            autoscaler_summary = self.autoscaler.summary()
            report["autoscaler_report"] = autoscaler_summary._asdict()

        serialized = json.dumps(report)
        if _internal_kv_initialized():
            _internal_kv_put(
                DEBUG_AUTOSCALING_STATUS, serialized, overwrite=True
            )

        # Wait for an autoscaler update interval before processing the next
        # round of messages.
        time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
def __init__(self, namespace: str = None):
    """Store the namespace to use, defaulting to the empty string.

    Args:
        namespace: Optional namespace. ``None`` (or an empty string) is
            stored as ``""``.

    Raises:
        AssertionError: If the internal KV is not initialized.
        TypeError: If ``namespace`` is neither ``None`` nor a ``str``.
    """
    assert ray_kv._internal_kv_initialized()
    if namespace is not None and not isinstance(namespace, str):
        # Message grammar fixed: was "namespace must a string".
        raise TypeError(
            "namespace must be a string, got: {}.".format(type(namespace))
        )
    self.namespace = namespace or ""
def ensure_runtime_env_setup(pkg_uris: List[str]) -> Optional[str]:
    """Make sure all required packages are available locally.

    Each URI is fetched with ``fetch_package`` while holding a file lock
    derived from its local path, and the resulting directory is inserted
    at the front of ``sys.path``.

    Args:
        pkg_uris (List[str]): Package URIs for the runtime env.

    Returns:
        The directory of the last fetched package as a string, or None
        when no directory was produced (e.g. ``pkg_uris`` is empty).
    """
    assert _internal_kv_initialized()
    last_dir = None
    for uri in pkg_uris:
        # For each node, the package will only be downloaded one time.
        # Locking avoids multiple processes downloading concurrently.
        lock_path = str(Path(_get_local_path(uri))) + ".lock"
        with FileLock(lock_path):
            last_dir = fetch_package(uri)
        sys.path.insert(0, str(last_dir))

    # Right now, multiple pkg_uris are not supported correctly.
    # We return the last one as working directory.
    if not last_dir:
        return None
    return str(last_dir)
def register(self, category, key, value):
    """Queue a pickled value under (category, key) and flush if possible.

    Args:
        category: Must be one of ``KNOWN_CATEGORIES``.
        key: Key within the category.
        value: Object to pickle and store.

    Raises:
        TuneError: If ``category`` is not in ``KNOWN_CATEGORIES``.
    """
    if category not in KNOWN_CATEGORIES:
        from ray.tune import TuneError

        raise TuneError(
            f"Unknown category {category} not among {KNOWN_CATEGORIES}"
        )

    serialized = pickle.dumps(value)
    self._to_flush[(category, key)] = serialized
    # Pending values are only flushed once the internal KV is available.
    if _internal_kv_initialized():
        self.flush_values()
def __init__(self, dashboard_head):
    """Initialize stub placeholders and the job status storage client.

    Args:
        dashboard_head: The owning dashboard head; kept on the instance.

    Raises:
        AssertionError: If the internal KV is not initialized.
    """
    super().__init__(dashboard_head)
    # The GCS stubs start out unset.
    self._gcs_job_info_stub = None
    self._gcs_actor_info_stub = None
    self._dashboard_head = dashboard_head

    kv_ready = _internal_kv_initialized()
    assert kv_ready
    self._job_status_client = JobStatusStorageClient()
def _run(self):
    """Run the monitor loop.

    Every iteration refreshes load metrics, resource requests and the
    event summary, lets the autoscaler update once load metrics are
    available, and writes a JSON status report to the internal KV when it
    is initialized. Exceptions inside an iteration are logged and retried
    when ``retry_on_failure`` is set, otherwise re-raised. The loop exits
    once ``stop_event`` is set.
    """
    while True:
        try:
            if self.stop_event and self.stop_event.is_set():
                break
            self.update_load_metrics()
            self.update_resource_requests()
            self.update_event_summary()

            report = {
                "load_metrics_report": asdict(self.load_metrics.summary()),
                "time": time.time(),
                "monitor_pid": os.getpid(),
            }

            if self.autoscaler:
                if not self.load_metrics:
                    # load_metrics is Falsey iff we haven't collected any
                    # resource messages from the GCS, which can happen at
                    # startup if the GCS hasn't yet received data from the
                    # Raylets. In this case, do not do an autoscaler
                    # update. Wait to get load metrics.
                    logger.info(
                        "Autoscaler has not yet received load metrics. Waiting."
                    )
                else:
                    # Process autoscaling actions
                    self.autoscaler.update()
                    summary = self.autoscaler.summary()
                    if summary:
                        report["autoscaler_report"] = asdict(summary)

                    for event_msg in self.event_summarizer.summary():
                        # Need to prefix each line of the message for the
                        # lines to get pushed to the driver logs.
                        for event_line in event_msg.split("\n"):
                            logger.info(
                                f"{ray_constants.LOG_PREFIX_EVENT_SUMMARY}"
                                f"{event_line}"
                            )
                    self.event_summarizer.clear()

            serialized = json.dumps(report)
            if _internal_kv_initialized():
                _internal_kv_put(
                    ray_constants.DEBUG_AUTOSCALING_STATUS,
                    serialized,
                    overwrite=True,
                )
        except Exception:
            # By default, do not exit the monitor on failure.
            if not self.retry_on_failure:
                raise
            logger.exception("Monitor: Execution exception. Trying again...")

        # Wait for an autoscaler update interval before processing the next
        # round of messages.
        time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
def _put_pre_init_library_usages():
    """Put every already-recorded library usage, but only from a driver.

    Raises:
        AssertionError: If the internal KV is not initialized.
    """
    assert _internal_kv_initialized()
    # NOTE: When the lib is imported from a worker, ray should
    # always be initialized, so there's no need to register the
    # pre init hook.
    running_as_driver = (
        ray._private.worker.global_worker.mode == ray.SCRIPT_MODE
    )
    if running_as_driver:
        for usage in _recorded_library_usages:
            _put_library_usage(usage)
def __init__(self, dashboard_head):
    """Initialize stub placeholders and the internal KV.

    Args:
        dashboard_head: The owning dashboard head; its ``gcs_client`` is
            used to initialize the internal KV.
    """
    super().__init__(dashboard_head)
    # The GCS stubs start out unset.
    self._gcs_job_info_stub = None
    self._gcs_actor_info_stub = None
    self._dashboard_head = dashboard_head

    # Initialize internal KV to be used by the working_dir setup code.
    gcs_client = dashboard_head.gcs_client
    _initialize_internal_kv(gcs_client)
    assert _internal_kv_initialized()
def get(self, category, key):
    """Return the unpickled value registered under (category, key).

    Reads from the internal KV when it is initialized, otherwise from the
    local not-yet-flushed values.

    Raises:
        ValueError: If the internal KV is initialized but holds no value
            for the key.
        KeyError: If the internal KV is not initialized and the key is
            not among the pending values.
    """
    if not _internal_kv_initialized():
        return pickle.loads(self._to_flush[(category, key)])

    stored = _internal_kv_get(_make_key(category, key))
    if stored is None:
        raise ValueError(
            f"Registry value for {category}/{key} doesn't exist."
        )
    return pickle.loads(stored)
def __init__(self, dashboard_head):
    """Initialize stub placeholders, the job info client and a thread pool.

    Args:
        dashboard_head: The owning dashboard head; kept on the instance.

    Raises:
        AssertionError: If the internal KV is not initialized.
    """
    super().__init__(dashboard_head)
    # The GCS stubs start out unset.
    self._gcs_job_info_stub = None
    self._gcs_actor_info_stub = None
    self._dashboard_head = dashboard_head

    kv_ready = _internal_kv_initialized()
    assert kv_ready
    self._job_info_client = JobInfoStorageClient()

    # For offloading CPU intensive work.
    executor = concurrent.futures.ThreadPoolExecutor(
        max_workers=2,
        thread_name_prefix="api_head",
    )
    self._thread_pool = executor
def _put_library_usage(library_usage: str):
    """Write a marker key for ``library_usage`` into the internal KV.

    The key is the library usage prefix followed by the library name, the
    value is an empty string, and the entry goes into the usage stats
    namespace. Any failure of the put is logged at debug level and
    swallowed.

    Raises:
        AssertionError: If the internal KV is not initialized.
    """
    assert _internal_kv_initialized()
    try:
        kv_key = f"{usage_constant.LIBRARY_USAGE_PREFIX}{library_usage}"
        _internal_kv_put(
            kv_key, "", namespace=usage_constant.USAGE_STATS_NAMESPACE
        )
    except Exception as err:
        # Best effort: a failed put must not break the caller.
        logger.debug(f"Failed to put library usage, {err}")
def update_resource_requests(self):
    """Fetches resource requests from the internal KV and updates load.

    Does nothing when the internal KV is not initialized or holds no data
    for the resource request channel. Errors while decoding or applying
    the request are logged, not raised.
    """
    if not _internal_kv_initialized():
        return

    raw_request = _internal_kv_get(
        ray.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL
    )
    if not raw_request:
        return

    try:
        parsed_request = json.loads(raw_request)
        self.load_metrics.set_resource_requests(parsed_request)
    except Exception:
        logger.exception("Error parsing resource requests")
def log_info_string(self, nodes):
    """Build the cluster status text, store it in the KV and debug-log it.

    Args:
        nodes: Forwarded to ``self.info_string`` and to the resource
            demand scheduler's ``debug_string``.
    """
    # Three sections separated by newlines: node info, load metrics, and
    # the resource demand scheduler's debug output.
    status_text = (
        "Cluster status: "
        + self.info_string(nodes)
        + "\n"
        + self.load_metrics.info_string()
        + "\n"
        + self.resource_demand_scheduler.debug_string(
            nodes,
            self.pending_launches.breakdown(),
            self.load_metrics.get_resource_utilization(),
        )
    )
    if _internal_kv_initialized():
        _internal_kv_put(DEBUG_AUTOSCALING_STATUS, status_text, overwrite=True)
    logger.debug(status_text)
def register(self, category, key, value):
    """Registers the value with the global registry.

    The value is serialized with ``pickle.dumps_debug`` and queued; queued
    values are flushed right away when the internal KV is initialized.

    Raises:
        TuneError: If ``category`` is not in ``KNOWN_CATEGORIES``.
        PicklingError: If unable to pickle the provided value.
    """
    category_is_known = category in KNOWN_CATEGORIES
    if not category_is_known:
        from ray.tune import TuneError

        raise TuneError(
            f"Unknown category {category} not among {KNOWN_CATEGORIES}"
        )

    flush_key = (category, key)
    self._to_flush[flush_key] = pickle.dumps_debug(value)
    if _internal_kv_initialized():
        self.flush_values()
def run(self):
    """Install termination handlers, then initialize and run the monitor.

    Any exception from initialization or the loop is passed (as a
    formatted traceback) to ``_handle_failure`` and then re-raised.
    """
    # Register signal handlers for autoscaler termination.
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, self._signal_handler)

    try:
        if _internal_kv_initialized():
            # Delete any previous autoscaling errors.
            _internal_kv_del(DEBUG_AUTOSCALING_ERROR)
        self._initialize_autoscaler()
        self._run()
    except Exception:
        failure_trace = traceback.format_exc()
        self._handle_failure(failure_trace)
        raise
def record_library_usage(library_usage: str):
    """Record library usage (e.g. which library is used).

    The name is remembered locally on first sight. It is additionally put
    into the KV store when the internal KV is initialized and the current
    worker runs in script (driver) mode.
    """
    if library_usage in _recorded_library_usages:
        return
    _recorded_library_usages.add(library_usage)

    # The KV is not initialized if the library is imported before
    # ray.init. Only the driver reports, to reduce the load on the KV store.
    if (
        _internal_kv_initialized()
        and ray.worker.global_worker.mode == ray.SCRIPT_MODE
    ):
        _put_library_usage(library_usage)
def legacy_log_info_string(autoscaler, nodes):
    """Build the legacy cluster status text, store it in the KV and log it.

    Args:
        autoscaler: Object providing ``load_metrics``,
            ``resource_demand_scheduler`` and ``pending_launches``.
        nodes: Forwarded to ``info_string`` and to the resource demand
            scheduler's ``debug_string``.
    """
    # Three sections separated by newlines: node info, load metrics, and
    # the resource demand scheduler's debug output.
    sections = [
        "Cluster status: ",
        info_string(autoscaler, nodes),
        "\n",
        autoscaler.load_metrics.info_string(),
        "\n",
        autoscaler.resource_demand_scheduler.debug_string(
            nodes,
            autoscaler.pending_launches.breakdown(),
            autoscaler.load_metrics.get_resource_utilization(),
        ),
    ]
    status_text = "".join(sections)
    if _internal_kv_initialized():
        _internal_kv_put(
            DEBUG_AUTOSCALING_STATUS_LEGACY, status_text, overwrite=True
        )
    logger.debug(status_text)
def __init__(self, dashboard_agent):
    """Create the env caches and initialize the internal KV.

    Args:
        dashboard_agent: The owning agent. Its ``runtime_env_dir`` and
            ``logging_params`` attributes are read here.
    """
    super().__init__(dashboard_agent)
    self._runtime_env_dir = dashboard_agent.runtime_env_dir
    self._logging_params = dashboard_agent.logging_params
    self._per_job_logger_cache = {}

    # Cache the results of creating envs to avoid repeatedly calling into
    # conda and other slow calls.
    self._env_cache: Dict[str, CreatedEnvResult] = {}
    # Maps a serialized runtime env to a lock that is used
    # to prevent multiple concurrent installs of the same env.
    self._env_locks: Dict[str, asyncio.Lock] = {}

    # Initialize internal KV to be used by the working_dir setup code.
    # NOTE(review): self._dashboard_agent is presumably assigned by the
    # parent constructor -- confirm.
    gcs_client = self._dashboard_agent.gcs_client
    _initialize_internal_kv(gcs_client)
    assert _internal_kv_initialized()
def update(self):
    """Run one autoscaler update, tolerating a bounded number of failures.

    A failure is logged, stored in the internal KV (when initialized) and
    counted. The exception is re-raised only once the failure count
    exceeds ``max_failures``.
    """
    try:
        self.reset(errors_fatal=False)
        self._update()
    except Exception as exc:
        logger.exception("StandardAutoscaler: Error during autoscaling.")
        if _internal_kv_initialized():
            _internal_kv_put(
                DEBUG_AUTOSCALING_ERROR, str(exc), overwrite=True
            )
        self.num_failures += 1
        if self.num_failures <= self.max_failures:
            # Still within the failure budget: swallow the error.
            return
        logger.critical("StandardAutoscaler: Too many errors, abort.")
        raise exc
def package_exists(pkg_uri: str) -> bool:
    """Check whether the package with the given URI exists.

    Args:
        pkg_uri (str): The URI of the package.

    Returns:
        True if the package exists, False otherwise.

    Raises:
        AssertionError: If the internal KV is not initialized.
        NotImplementedError: If the URI's protocol is neither GCS nor
            PIN_GCS.
    """
    assert _internal_kv_initialized()
    protocol, _ = _parse_uri(pkg_uri)
    if protocol not in (Protocol.GCS, Protocol.PIN_GCS):
        raise NotImplementedError(f"Protocol {protocol} is not supported")
    return _internal_kv_exists(pkg_uri)
def record_library_usage(library_usage: str):
    """Record library usage (e.g. which library is used).

    The name is remembered locally on first sight. It is additionally put
    into the KV store when the internal KV is initialized and the current
    worker runs in script or worker mode.
    """
    if library_usage in _recorded_library_usages:
        return
    _recorded_library_usages.add(library_usage)

    if not _internal_kv_initialized():
        # This happens if the library is imported before ray.init
        return

    # Only report lib usage for driver / workers. Otherwise,
    # it can be reported if the library is imported from
    # e.g., API server.
    reporting_modes = (ray.SCRIPT_MODE, ray.WORKER_MODE)
    if ray._private.worker.global_worker.mode in reporting_modes:
        _put_library_usage(library_usage)
def _handle_failure(self, error):
    """Log a fatal monitor error, store it in the KV and push it to drivers.

    When an autoscaler exists, its workers are killed and destroyed
    first. The error is pushed to drivers through Redis.

    Args:
        error: Failure description; interpolated into the stored and
            pushed message.
    """
    logger.exception("Error in monitor loop")
    autoscaler_present = self.autoscaler is not None
    if autoscaler_present:
        self.autoscaler.kill_workers()
        # Take down autoscaler workers if necessary.
        self.destroy_autoscaler_workers()

    # Something went wrong, so push an error to all current and future
    # drivers.
    failure_text = f"The autoscaler failed with the following error:\n{error}"
    if _internal_kv_initialized():
        _internal_kv_put(DEBUG_AUTOSCALING_ERROR, failure_text, overwrite=True)

    # NOTE(review): `args` is not defined in this function; presumably the
    # module-level parsed command-line arguments -- confirm.
    client = ray._private.services.create_redis_client(
        args.redis_address, password=args.redis_password
    )

    from ray.utils import push_error_to_driver_through_redis

    push_error_to_driver_through_redis(
        client, ray_constants.MONITOR_DIED_ERROR, failure_text
    )
def _run(self):
    """Run the monitor loop.

    Every iteration refreshes load metrics, resource requests and the
    event summary, lets the autoscaler (if present) update, and writes a
    JSON status report to the internal KV when it is initialized. Any
    exception inside an iteration is logged and the loop continues. The
    loop exits once ``stop_event`` is set.
    """
    while True:
        try:
            if self.stop_event and self.stop_event.is_set():
                break
            self.update_load_metrics()
            self.update_resource_requests()
            self.update_event_summary()

            report = {
                "load_metrics_report": asdict(self.load_metrics.summary()),
                "time": time.time(),
                "monitor_pid": os.getpid(),
            }

            # Process autoscaling actions
            if self.autoscaler:
                # Only used to update the load metrics for the autoscaler.
                self.autoscaler.update()
                summary = self.autoscaler.summary()
                report["autoscaler_report"] = asdict(summary)

                for event_msg in self.event_summarizer.summary():
                    # Need to prefix each line of the message for the lines
                    # to get pushed to the driver logs.
                    for event_line in event_msg.split("\n"):
                        logger.info(
                            f"{ray_constants.LOG_PREFIX_EVENT_SUMMARY}"
                            f"{event_line}"
                        )
                self.event_summarizer.clear()

            serialized = json.dumps(report)
            if _internal_kv_initialized():
                _internal_kv_put(
                    ray_constants.DEBUG_AUTOSCALING_STATUS,
                    serialized,
                    overwrite=True,
                )
        except Exception:
            logger.exception("Monitor: Execution exception. Trying again...")

        # Wait for an autoscaler update interval before processing the next
        # round of messages.
        time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
def update(self):
    """Run one autoscaler update, tolerating a bounded number of failures.

    A failure is logged and stored in the internal KV (when initialized).
    It is counted unless the provider type is "kubernetes" and the error
    is a ``MaxRetryError``. The exception is re-raised once the failure
    count exceeds ``max_failures``.
    """
    try:
        self.reset(errors_fatal=False)
        self._update()
    except Exception as exc:
        logger.exception("StandardAutoscaler: Error during autoscaling.")
        if _internal_kv_initialized():
            _internal_kv_put(
                DEBUG_AUTOSCALING_ERROR, str(exc), overwrite=True
            )

        # Don't abort the autoscaler if the K8s API server is down.
        # https://github.com/ray-project/ray/issues/12255
        k8s_api_unreachable = self.config["provider"][
            "type"
        ] == "kubernetes" and isinstance(exc, MaxRetryError)
        if not k8s_api_unreachable:
            self.num_failures += 1

        if self.num_failures <= self.max_failures:
            # Still within the failure budget: swallow the error.
            return
        logger.critical("StandardAutoscaler: Too many errors, abort.")
        raise exc
def record_library_usage(library_usage: str):
    """Record library usage (e.g. which library is used).

    The name is remembered locally on first sight. It is additionally put
    into the KV store when the internal KV is initialized and the current
    worker runs in script or worker mode.

    Raises:
        ValueError: If a not-yet-recorded ``library_usage`` contains "-".
    """
    already_recorded = library_usage in _recorded_library_usages
    if already_recorded:
        return

    if "-" in library_usage:
        # - is not permitted since it should be used as a separator
        # of the lib usage file name. See LibUsageRecorder.
        raise ValueError(
            "The library name contains a char - which is not permitted."
        )

    _recorded_library_usages.add(library_usage)

    if not _internal_kv_initialized():
        # This happens if the library is imported before ray.init
        return

    # Only report lib usage for driver / workers. Otherwise,
    # it can be reported if the library is imported from
    # e.g., API server.
    worker_mode = ray._private.worker.global_worker.mode
    if worker_mode == ray.SCRIPT_MODE or worker_mode == ray.WORKER_MODE:
        _put_library_usage(library_usage)
def contains(self, category, key):
    """Return whether a value is registered under (category, key).

    Looks in the internal KV when it is initialized, otherwise in the
    local not-yet-flushed values.
    """
    if not _internal_kv_initialized():
        return (category, key) in self._to_flush
    return _internal_kv_get(_make_key(category, key)) is not None