def _create_placement_group(self):
    """Create a placement group for the workers unless one is inherited.

    If a placement group is already detected (Tune) and child tasks are
    captured into it, this is a no-op.

    By default the placement group is created with the PACK strategy,
    which is optimized for colocating GPUs on a minimal number of nodes.
    Setting the environment variable named by
    ``TRAIN_ENABLE_WORKER_SPREAD_ENV`` to a non-zero integer switches to
    the SPREAD strategy.

    A newly created placement group is stored as ``self._placement_group``.

    Raises:
        TimeoutError: If the placement group is not ready within the
            timeout read via ``TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV``
            (default 100). The still-pending placement group is removed
            before raising so that it does not linger in the cluster.
    """
    current_placement_group = get_current_placement_group()
    worker = ray._private.worker.global_worker
    captures_child_tasks = worker.should_capture_child_tasks_in_placement_group

    # An inherited placement group that captures child tasks is reused as-is.
    if current_placement_group is not None and captures_child_tasks:
        return

    additional_resources_per_worker = self._additional_resources_per_worker or {}
    bundle = {
        "CPU": self._num_cpus_per_worker,
        "GPU": self._num_gpus_per_worker,
        **additional_resources_per_worker,
    }
    # One independent bundle per worker.
    bundles = [bundle.copy() for _ in range(self._num_workers)]

    use_spread = bool(env_integer(TRAIN_ENABLE_WORKER_SPREAD_ENV, 0))
    strategy = "SPREAD" if use_spread else "PACK"

    placement_group = ray.util.placement_group(bundles, strategy=strategy)
    logger.debug("Waiting for placement group to start.")
    timeout = env_integer(TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV, 100)
    ready, _ = ray.wait([placement_group.ready()], timeout=timeout)

    if not ready:
        # Build the message first so the reported resources reflect the
        # cluster state at the moment of the timeout.
        error_message = (
            "Placement group creation timed out. Make sure your "
            "cluster either has enough resources or use an "
            "autoscaling cluster. If you are running on a cluster, "
            "make sure you specify an address in `ray.init()`, for example, "
            '`ray.init("auto")`. You can also increase the timeout by setting '
            "the TRAIN_PLACEMENT_GROUP_TIMEOUT_S environment variable. \n"
            "Current resources available: {}, resources requested by the "
            "placement group: {}"
        ).format(ray.available_resources(), placement_group.bundle_specs)
        # Without this the pending group stays queued and may reserve
        # resources later even though nobody will ever use it.
        try:
            ray.util.remove_placement_group(placement_group)
        except Exception:
            # Best-effort cleanup: never mask the TimeoutError below.
            logger.debug(
                "Failed to remove the timed-out placement group.",
                exc_info=True,
            )
        raise TimeoutError(error_message)

    logger.debug("Placement group has started.")
    self._placement_group = placement_group
def start(
    self,
    initialization_hook: Optional[Callable[[], None]] = None,
    train_cls: Optional[Type] = None,
    train_cls_args: Optional[Tuple] = None,
    train_cls_kwargs: Optional[Dict] = None,
):
    """Start the worker group and run the backend start-up logic.

    Args:
        initialization_hook: Optional callable that, when given, is
            remembered on the executor and executed on the worker group
            before the backend is started.
        train_cls: Forwarded to ``WorkerGroup`` as ``actor_cls``.
        train_cls_args: Forwarded to ``WorkerGroup`` as ``actor_cls_args``.
        train_cls_kwargs: Forwarded to ``WorkerGroup`` as
            ``actor_cls_kwargs``.

    If a ``RayActorError`` is raised during start-up, the failure is
    logged and counted, and all workers are restarted.
    """
    self._create_placement_group()

    worker_group_options = {
        "num_workers": self._num_workers,
        "num_cpus_per_worker": self._num_cpus_per_worker,
        "num_gpus_per_worker": self._num_gpus_per_worker,
        "additional_resources_per_worker": self._additional_resources_per_worker,
        "actor_cls": train_cls,
        "actor_cls_args": train_cls_args,
        "actor_cls_kwargs": train_cls_kwargs,
        # Fall back to "default" when no placement group is stored.
        "placement_group": self._placement_group or "default",
    }
    self.worker_group = WorkerGroup(**worker_group_options)

    try:
        if initialization_hook:
            self._initialization_hook = initialization_hook
            self.worker_group.execute(initialization_hook)

        # The environment variable takes precedence over the backend's
        # own default for sharing CUDA_VISIBLE_DEVICES.
        share_devices_flag = env_integer(
            ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV,
            self._backend.share_cuda_visible_devices,
        )
        workers_use_gpus = self._num_gpus_per_worker > 0
        if workers_use_gpus and bool(share_devices_flag):
            self._share_cuda_visible_devices()

        self._backend.on_start(self.worker_group, self._backend_config)
    except RayActorError as actor_error:
        logger.exception(str(actor_error))
        logger.warning(
            "Failure occurred during startup. Restarting all workers and "
            "attempting to startup again."
        )
        self._increment_failures()
        self._restart()
def start_training(
    self,
    train_func: Callable[[], T],
    dataset_spec: RayDatasetSpec,
    checkpoint: Optional[Checkpoint] = None,
) -> None:
    """Run a training function on every worker in a separate thread.

    ``finish_training`` should be called after this.

    Args:
        train_func: The training function to run on each worker.
        dataset_spec: A specification for the Ray Dataset to be passed
            to the training workers, and the logic on how to shard the
            Ray Dataset.
        checkpoint: The checkpoint data that should be loaded onto each
            worker and accessed by the training function via
            ``train.load_checkpoint()``. If this is ``None`` then no
            checkpoint will be loaded.
    """
    detailed_metrics = env_integer(ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0)

    # Step 1: set up a session on every worker.
    def initialize_session(
        train_func,
        world_rank,
        local_rank,
        world_size,
        trial_info,
        checkpoint,
        dataset_shard,
        encode_data_fn,
    ):
        session_options = {
            "training_func": train_func,
            "world_rank": world_rank,
            "local_rank": local_rank,
            "world_size": world_size,
            "trial_info": trial_info,
            "dataset_shard": dataset_shard,
            "checkpoint": checkpoint,
            "encode_data_fn": encode_data_fn,
            "detailed_autofilled_metrics": detailed_metrics,
        }
        try:
            init_session(**session_options)
        except ValueError:
            raise TrainBackendError(
                "Attempting to start training but a "
                "previous training run is still ongoing. \n"
                "You must call `finish_training` before "
                "calling `start_training` again."
            )

    # Shards are computed once and then reused by later calls.
    if self.dataset_shards is None:
        worker_actors = [w.actor for w in self.worker_group.workers]
        self.dataset_shards = dataset_spec.get_dataset_shards(worker_actors)

    local_rank_map = self._create_local_rank_map()
    worker_count = len(self.worker_group)

    futures = [
        self.worker_group.execute_single_async(
            rank,
            initialize_session,
            world_rank=rank,
            local_rank=local_rank_map[rank],
            world_size=worker_count,
            trial_info=self._trial_info,
            train_func=train_func,
            dataset_shard=self.dataset_shards[rank],
            checkpoint=checkpoint,
            encode_data_fn=self._backend.encode_data,
        )
        for rank in range(worker_count)
    ]
    self.get_with_failure_handling(futures)

    # Step 2: kick off the training function asynchronously in its own thread.
    def train_async():
        get_session().start()

    self.worker_group.execute_async(train_async)
from ray._private.ray_constants import env_integer

# Dashboard / dashboard-agent constants.
# NOTE(review): env_integer(name, default) presumably returns the integer
# value of the environment variable `name`, or `default` when it is unset.

# Log file names and the agent-port marker string.
DASHBOARD_LOG_FILENAME = "dashboard.log"
DASHBOARD_AGENT_PORT_PREFIX = "DASHBOARD_AGENT_PORT_PREFIX:"
DASHBOARD_AGENT_LOG_FILENAME = "dashboard_agent.log"
DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS = 2
# Name of the environment variable consulted for the default below.
RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME = "RAY_STATE_SERVER_MAX_HTTP_REQUEST"
# Default number of in-progress requests to the state api server.
RAY_STATE_SERVER_MAX_HTTP_REQUEST = env_integer(
    RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME, 100
)
# Upper bound on the number of in-progress requests that may be configured.
RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED = 1000
RETRY_REDIS_CONNECTION_TIMES = 10
CONNECT_REDIS_INTERNAL_SECONDS = 2
# Ten minutes, expressed in seconds.
PURGE_DATA_INTERVAL_SECONDS = 60 * 10
ORGANIZE_DATA_INTERVAL_SECONDS = 2
DASHBOARD_RPC_ADDRESS = "dashboard_rpc"
GCS_SERVER_ADDRESS = "GcsServerAddress"
# GCS check alive (each value can be overridden through the environment
# variable of the same name).
GCS_CHECK_ALIVE_MAX_COUNT_OF_RPC_ERROR = env_integer(
    "GCS_CHECK_ALIVE_MAX_COUNT_OF_RPC_ERROR", 10
)
GCS_CHECK_ALIVE_INTERVAL_SECONDS = env_integer("GCS_CHECK_ALIVE_INTERVAL_SECONDS", 5)
GCS_CHECK_ALIVE_RPC_TIMEOUT = env_integer("GCS_CHECK_ALIVE_RPC_TIMEOUT", 10)
GCS_RETRY_CONNECT_INTERVAL_SECONDS = env_integer(
    "GCS_RETRY_CONNECT_INTERVAL_SECONDS", 2
)
# aiohttp_cache
AIOHTTP_CACHE_TTL_SECONDS = 2
from ray.util.client.common import ( CLIENT_SERVER_MAX_THREADS, GRPC_OPTIONS, OBJECT_TRANSFER_CHUNK_SIZE, ClientServerHandle, ResponseCache, ) from ray.util.client.server.dataservicer import DataServicer from ray.util.client.server.logservicer import LogstreamServicer from ray.util.client.server.proxier import serve_proxier from ray.util.client.server.server_pickler import dumps_from_server, loads_from_client from ray.util.client.server.server_stubs import current_server logger = logging.getLogger(__name__) TIMEOUT_FOR_SPECIFIC_SERVER_S = env_integer("TIMEOUT_FOR_SPECIFIC_SERVER_S", 30) def _use_response_cache(func): """ Decorator for gRPC stubs. Before calling the real stubs, checks if there's an existing entry in the caches. If there is, then return the cached entry. Otherwise, call the real function and use the real cache """ @functools.wraps(func) def wrapper(self, request, context): metadata = {k: v for k, v in context.invocation_metadata()} expected_ids = ("client_id", "thread_id", "req_id") if any(i not in metadata for i in expected_ids): # Missing IDs, skip caching and call underlying stub directly return func(self, request, context)
import ray from ray._private.ray_constants import env_integer from ray.types import ObjectRef from ray.util.annotations import PublicAPI try: import tqdm needs_warning = False except ImportError: tqdm = None needs_warning = True # Whether progress bars are enabled in this thread. _enabled = not bool(env_integer("RAY_DATA_DISABLE_PROGRESS_BARS", 0)) # Used a signal to cancel execution. _canceled_threads = set() _canceled_threads_lock = threading.Lock() @PublicAPI def set_progress_bars(enabled: bool) -> bool: """Set whether progress bars are enabled. The default behavior is controlled by the ``RAY_DATA_DISABLE_PROGRESS_BARS`` environment variable. By default, it is set to "0". Setting it to "1" will disable progress bars, unless they are reenabled by this method.
from ray._private.ray_constants import env_integer
from ray.core.generated import event_pb2

# Event-module constants.
# NOTE(review): env_integer(name, default) presumably returns the integer
# value of the environment variable `name`, or `default` when it is unset.

LOG_ERROR_EVENT_STRING_LENGTH_LIMIT = 1000
RETRY_CONNECT_TO_DASHBOARD_INTERVAL_SECONDS = 2
# Monitor events
SCAN_EVENT_DIR_INTERVAL_SECONDS = env_integer(
    "SCAN_EVENT_DIR_INTERVAL_SECONDS", 2)
# Negative offset of thirty minutes, expressed in seconds.
SCAN_EVENT_START_OFFSET_SECONDS = -30 * 60
CONCURRENT_READ_LIMIT = 50
EVENT_READ_LINE_COUNT_LIMIT = 200
EVENT_READ_LINE_LENGTH_LIMIT = env_integer("EVENT_READ_LINE_LENGTH_LIMIT",
                                           2 * 1024 * 1024)  # 2MB
# Report events
EVENT_AGENT_REPORT_INTERVAL_SECONDS = 0.1
EVENT_AGENT_RETRY_TIMES = 10
EVENT_AGENT_CACHE_SIZE = 10240
# Event sources
# The head monitors only the GCS source type ...
EVENT_HEAD_MONITOR_SOURCE_TYPES = [
    event_pb2.Event.SourceType.Name(event_pb2.Event.GCS)
]
# ... and the agent monitors every other source type (set difference, so
# the resulting list order is not guaranteed).
EVENT_AGENT_MONITOR_SOURCE_TYPES = list(
    set(event_pb2.Event.SourceType.keys()) -
    set(EVENT_HEAD_MONITOR_SOURCE_TYPES))
EVENT_SOURCE_ALL = event_pb2.Event.SourceType.keys()
from ray._private.ray_constants import env_integer

# Dashboard / dashboard-agent constants.
# NOTE(review): env_integer(name, default) presumably returns the integer
# value of the environment variable `name`, or `default` when it is unset.

# Log file names and the agent-port marker string.
DASHBOARD_LOG_FILENAME = "dashboard.log"
DASHBOARD_AGENT_PORT_PREFIX = "DASHBOARD_AGENT_PORT_PREFIX:"
DASHBOARD_AGENT_LOG_FILENAME = "dashboard_agent.log"
DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS = 2
RETRY_REDIS_CONNECTION_TIMES = 10
CONNECT_REDIS_INTERNAL_SECONDS = 2
# Ten minutes, expressed in seconds.
PURGE_DATA_INTERVAL_SECONDS = 60 * 10
ORGANIZE_DATA_INTERVAL_SECONDS = 2
DASHBOARD_RPC_ADDRESS = "dashboard_rpc"
GCS_SERVER_ADDRESS = "GcsServerAddress"
# GCS check alive (each value can be overridden through the environment
# variable of the same name).
GCS_CHECK_ALIVE_MAX_COUNT_OF_RPC_ERROR = env_integer(
    "GCS_CHECK_ALIVE_MAX_COUNT_OF_RPC_ERROR", 10)
GCS_CHECK_ALIVE_INTERVAL_SECONDS = env_integer(
    "GCS_CHECK_ALIVE_INTERVAL_SECONDS", 5)
GCS_CHECK_ALIVE_RPC_TIMEOUT = env_integer("GCS_CHECK_ALIVE_RPC_TIMEOUT", 10)
GCS_RETRY_CONNECT_INTERVAL_SECONDS = env_integer(
    "GCS_RETRY_CONNECT_INTERVAL_SECONDS", 2)
# aiohttp_cache
AIOHTTP_CACHE_TTL_SECONDS = 2
AIOHTTP_CACHE_MAX_SIZE = 128
AIOHTTP_CACHE_DISABLE_ENVIRONMENT_KEY = "RAY_DASHBOARD_NO_CACHE"
# Named signals
SIGNAL_NODE_INFO_FETCHED = "node_info_fetched"
SIGNAL_NODE_SUMMARY_FETCHED = "node_summary_fetched"
SIGNAL_JOB_INFO_FETCHED = "job_info_fetched"
SIGNAL_WORKER_INFO_FETCHED = "worker_info_fetched"
# Default value for the language (the default value in protobuf).
# NOTE(review): the previous comment said "datacenter", which does not match
# this constant - presumably a copy/paste leftover; confirm.
DEFAULT_LANGUAGE = "PYTHON"
import ray._private.ray_constants as ray_constants

# Runtime-env retry settings; each can be overridden through the environment
# variable of the same name.
# NOTE(review): env_integer(name, default) presumably returns the integer
# value of the environment variable `name`, or `default` when it is unset.

# Number of retries (default 3).
RUNTIME_ENV_RETRY_TIMES = ray_constants.env_integer("RUNTIME_ENV_RETRY_TIMES", 3)
# Interval between retries in milliseconds (default 1000).
RUNTIME_ENV_RETRY_INTERVAL_MS = ray_constants.env_integer(
    "RUNTIME_ENV_RETRY_INTERVAL_MS", 1000
)
import ray._private.ray_constants as ray_constants

# Prefix string associated with the reporter.
# NOTE(review): how the prefix is used is not visible here - confirm.
REPORTER_PREFIX = "RAY_REPORTER:"
# The reporter will report its statistics this often (milliseconds).
# Overridable through the REPORTER_UPDATE_INTERVAL_MS environment variable;
# defaults to 2500.
REPORTER_UPDATE_INTERVAL_MS = ray_constants.env_integer(
    "REPORTER_UPDATE_INTERVAL_MS", 2500)