def wrap_function(train_func, warn=True):
    """Turn a function trainable into a ``FunctionRunner`` subclass.

    The calling convention of ``train_func`` is probed with
    ``detect_checkpoint_function``, ``detect_config_single`` and
    ``detect_reporter``; the generated class calls the function accordingly.

    Args:
        train_func: Training function to wrap. If it has a ``__mixins__``
            tuple, those classes are placed in front of ``FunctionRunner``
            in the bases of the generated class.
        warn (bool): Whether the "function checkpointing is disabled"
            warning may be emitted for config-only functions.

    Returns:
        The generated ``ImplicitFunc`` class.

    Raises:
        ValueError: If none of the supported signatures was detected.
    """
    base_classes = getattr(train_func, "__mixins__", ()) + (FunctionRunner, )

    func_args = inspect.getfullargspec(train_func).args

    use_checkpoint = detect_checkpoint_function(train_func)
    use_config_single = detect_config_single(train_func)
    use_reporter = detect_reporter(train_func)

    if not (use_checkpoint or use_config_single or use_reporter):
        # use_reporter is hidden
        raise ValueError(
            "Unknown argument found in the Trainable function. "
            "The function args must include a 'config' positional "
            "parameter. Any other args must be 'checkpoint_dir'. "
            "Found: {}".format(func_args))

    checkpointing_disabled = use_config_single and not use_checkpoint
    # Evaluation order (log_once() before `warn`) is kept on purpose.
    if (checkpointing_disabled and log_once("tune_function_checkpoint")
            and warn):
        logger.warning(
            "Function checkpointing is disabled. This may result in "
            "unexpected behavior when using checkpointing features or "
            "certain schedulers. To enable, set the train function "
            "arguments to be `func(config, checkpoint_dir=None)`.")

    class ImplicitFunc(*base_classes):
        _name = getattr(train_func, "__name__", "func")

        def _trainable_func(self, config, reporter, checkpoint_dir):
            # Dispatch on the signature detected when wrapping.
            if use_checkpoint:
                result = train_func(config, checkpoint_dir=checkpoint_dir)
            elif use_reporter:
                result = train_func(config, reporter)
            else:
                result = train_func(config)

            # If train_func returns, we need to notify the main event loop
            # of the last result while avoiding double logging. This is done
            # with the keyword RESULT_DUPLICATE -- see tune/trial_runner.py.
            reporter(**{RESULT_DUPLICATE: True})
            return result

    return ImplicitFunc
def DistributedTrainableCreator(func: Callable,
                                num_workers: int = 1,
                                num_cpus_per_worker: int = 1,
                                num_gpus_per_worker: int = 0,
                                num_workers_per_host: Optional[int] = None,
                                backend: str = "gloo",
                                timeout_s: int = NCCL_TIMEOUT_S,
                                use_gpu=None) -> Type[_TorchTrainable]:
    """Creates a class that executes distributed training.

    Similar to running `torch.distributed.launch`.

    Note that you typically should not instantiate the object
    created.

    Args:
        func (callable): This function is a Tune trainable function.
            This function must have 2 args in the signature, and the
            latter arg must contain `checkpoint_dir`. For example:
            `func(config, checkpoint_dir=None)`.
        num_workers (int): Number of training workers to include in
            world.
        num_cpus_per_worker (int): Number of CPU resources to reserve
            per training worker.
        num_gpus_per_worker (int): Number of GPU resources to reserve
            per training worker.
        num_workers_per_host (Optional[int]): Number of workers to
            colocate per host. If set, `num_workers` must be a multiple
            of it.
        backend (str): One of "gloo", "nccl".
        timeout_s (float): Seconds before the torch process group
            times out. Useful when machines are unreliable. Defaults
            to ``NCCL_TIMEOUT_S``. This value is also stored on the
            returned class as ``_timeout_s``.
        use_gpu: Deprecated. Any truthy value raises a ``ValueError``;
            use `num_gpus_per_worker` instead.

    Returns:
        type(Trainable): A trainable class object that can be passed
        to Tune. Resources are automatically set within the object, so
        users do not need to set `resources_per_trainable`.

    Raises:
        ValueError: If `use_gpu` is set, or if `num_workers` is not a
            multiple of `num_workers_per_host`.

    Example:

    .. code-block:: python

        trainable_cls = DistributedTrainableCreator(
            train_func, num_workers=2)
        analysis = tune.run(trainable_cls)
    """
    if use_gpu:
        raise ValueError(
            "use_gpu is deprecated. Use 'num_gpus_per_worker' instead.")
    detect_checkpoint_function(func, abort=True)
    if num_workers_per_host and num_workers % num_workers_per_host:
        raise ValueError("`num_workers` must be an integer multiple "
                         "of `num_workers_per_host`.")

    class WrappedDistributedTorchTrainable(_TorchTrainable):
        _function = func
        _num_workers = num_workers
        _num_cpus_per_worker = num_cpus_per_worker
        _num_gpus_per_worker = num_gpus_per_worker
        _num_workers_per_host = num_workers_per_host
        _timeout_s = timeout_s

        @classmethod
        def default_process_group_parameters(cls) -> Dict:
            # `timedelta`'s first positional argument is *days*; the timeout
            # is given in seconds, so it must be passed by keyword.
            return dict(timeout=timedelta(seconds=timeout_s), backend=backend)

        @classmethod
        def default_resource_request(cls, config: Dict) -> Resources:
            # The trainable itself reserves nothing; all resources are
            # requested as "extra" resources for the training workers.
            return Resources(
                cpu=0,
                gpu=0,
                extra_cpu=num_cpus_per_worker * num_workers,
                extra_gpu=num_gpus_per_worker * num_workers)

    return WrappedDistributedTorchTrainable
def with_parameters(trainable, **kwargs):
    """Wrapper for trainables to pass arbitrary large data objects.

    This wrapper function will store all passed parameters in the Ray
    object store and retrieve them when calling the function. It can thus
    be used to pass arbitrary data, even datasets, to Tune trainables.

    This can also be used as an alternative to ``functools.partial`` to pass
    default arguments to trainables.

    When used with the function API, the trainable function is called with
    the passed parameters as keyword arguments. When used with the class API,
    the ``Trainable.setup()`` method is called with the respective kwargs.

    If the data already exists in the object store (are instances of
    ObjectRef), using ``tune.with_parameters()`` is not necessary. You can
    instead pass the object refs to the training function via the ``config``
    or use Python partials.

    Args:
        trainable: Trainable to wrap.
        **kwargs: parameters to store in object store.

    Returns:
        For class trainables, a subclass whose ``setup()`` forwards the
        stored parameters. For function trainables, a wrapper function
        that forwards the stored parameters and returns whatever the
        wrapped function returns. ``__mixins__`` of the wrapped function
        are copied onto the wrapper.

    Raises:
        ValueError: If ``trainable`` is not callable, or is a class that
            does not inherit from ``tune.Trainable``.

    Function API example:

    .. code-block:: python

        from ray import tune

        def train(config, data=None):
            for sample in data:
                loss = update_model(sample)
                tune.report(loss=loss)

        data = HugeDataset(download=True)

        tune.run(
            tune.with_parameters(train, data=data),
            # ...
        )

    Class API example:

    .. code-block:: python

        from ray import tune

        class MyTrainable(tune.Trainable):
            def setup(self, config, data=None):
                self.data = data
                self.iter = iter(self.data)
                self.next_sample = next(self.iter)

            def step(self):
                loss = update_model(self.next_sample)
                try:
                    self.next_sample = next(self.iter)
                except StopIteration:
                    return {"loss": loss, "done": True}
                return {"loss": loss}

        data = HugeDataset(download=True)

        tune.run(
            tune.with_parameters(MyTrainable, data=data),
            # ...
        )

    """
    from ray.tune.trainable import Trainable

    if not callable(trainable) or (inspect.isclass(trainable)
                                   and not issubclass(trainable, Trainable)):
        raise ValueError(
            f"`tune.with_parameters()` only works with function trainables "
            f"or classes that inherit from `tune.Trainable()`. Got type: "
            f"{type(trainable)}.")

    parameter_registry = _ParameterRegistry()
    ray.worker._post_init_hooks.append(parameter_registry.flush)

    # Objects are moved into the object store
    prefix = f"{str(trainable)}_"
    for k, v in kwargs.items():
        parameter_registry.put(prefix + k, v)

    trainable_name = getattr(trainable, "__name__", "tune_with_parameters")
    keys = list(kwargs.keys())

    if inspect.isclass(trainable):
        # Class trainable

        class _Inner(trainable):
            def setup(self, config):
                setup_kwargs = {
                    k: parameter_registry.get(prefix + k)
                    for k in keys
                }
                super(_Inner, self).setup(config, **setup_kwargs)

        _Inner.__name__ = trainable_name
        return _Inner

    # Function trainable
    use_checkpoint = detect_checkpoint_function(trainable, partial=True)

    def inner(config, checkpoint_dir=None):
        fn_kwargs = {}
        if use_checkpoint:
            # The directory handed in by the caller always wins. Only when
            # none was given do we fall back to the default declared on the
            # wrapped function - and never to the `Parameter.empty`
            # sentinel that marks "no default declared".
            resolved_dir = checkpoint_dir
            if resolved_dir is None:
                param = inspect.signature(trainable).parameters.get(
                    "checkpoint_dir")
                if (param is not None
                        and param.default is not inspect.Parameter.empty):
                    resolved_dir = param.default
            fn_kwargs["checkpoint_dir"] = resolved_dir

        for k in keys:
            fn_kwargs[k] = parameter_registry.get(prefix + k)
        # Propagate the result so the wrapper is transparent to callers
        # that look at the trainable's return value.
        return trainable(config, **fn_kwargs)

    inner.__name__ = trainable_name

    # Use correct function signature if no `checkpoint_dir` parameter
    # is set
    if not use_checkpoint:

        def _inner(config):
            return inner(config, checkpoint_dir=None)

        _inner.__name__ = trainable_name

        if hasattr(trainable, "__mixins__"):
            _inner.__mixins__ = trainable.__mixins__
        return _inner

    if hasattr(trainable, "__mixins__"):
        inner.__mixins__ = trainable.__mixins__

    return inner
def __init__(self,
             name,
             run,
             stop=None,
             time_budget_s=None,
             config=None,
             resources_per_trial=None,
             num_samples=1,
             local_dir=None,
             upload_dir=None,
             trial_name_creator=None,
             trial_dirname_creator=None,
             loggers=None,
             log_to_file=False,
             sync_to_driver=None,
             checkpoint_freq=0,
             checkpoint_at_end=False,
             sync_on_checkpoint=True,
             keep_checkpoints_num=None,
             checkpoint_score_attr=None,
             export_formats=None,
             max_failures=0,
             restore=None):
    """Validate the experiment arguments and assemble ``self.spec``.

    Args:
        name: Experiment name; falls back to the registered run
            identifier when falsy.
        run: Trainable (or its string identifier), registered through
            ``Experiment.register_if_needed``.
        stop: Falsy, a dict of stopping criteria, a function accepted by
            ``FunctionStopper.is_valid_function`` or a callable
            ``Stopper`` instance.
        time_budget_s: If truthy, a ``TimeoutStopper`` is installed
            (combined with any stopper derived from ``stop``).
        config: Stored under ``"config"``; defaults to an empty dict.
        local_dir: Expanded and made absolute; defaults to
            ``DEFAULT_RESULTS_DIR``.
        upload_dir: If truthy, ``remote_checkpoint_dir`` becomes
            ``upload_dir`` joined with the experiment directory name.
        loggers: Deprecated, must be ``None``.
        log_to_file: Resolved by ``_validate_log_to_file`` into a
            ``(stdout_file, stderr_file)`` pair.
        checkpoint_freq, checkpoint_at_end: Rejected when ``run`` is a
            checkpointable function.
        restore: If truthy, expanded and made absolute.

    All remaining arguments are stored unchanged in ``self.spec``.

    Raises:
        ValueError: On a non-``None`` ``loggers``, on checkpoint settings
            combined with a checkpointable function, or on an unsupported
            ``stop`` value.
    """
    if loggers is not None:
        # Most users won't run into this as `tune.run()` does not pass
        # the argument anymore. However, we will want to inform users
        # if they instantiate their `Experiment` objects themselves.
        raise ValueError(
            "Passing `loggers` to an `Experiment` is deprecated. Use "
            "an `ExperimentLogger` callback instead, e.g. by passing the "
            "`Logger` classes to `tune.logger.LegacyExperimentLogger` and "
            "passing this as part of the `callback` parameter to "
            "`tune.run()`.")

    config = config or {}

    is_checkpointable_fn = callable(run) and detect_checkpoint_function(run)
    if is_checkpointable_fn and checkpoint_at_end:
        raise ValueError("'checkpoint_at_end' cannot be used with a "
                         "checkpointable function. You can specify "
                         "and register checkpoints within "
                         "your trainable function.")
    if is_checkpointable_fn and checkpoint_freq:
        raise ValueError(
            "'checkpoint_freq' cannot be used with a "
            "checkpointable function. You can specify checkpoints "
            "within your trainable function.")

    self._run_identifier = Experiment.register_if_needed(run)
    self.name = name or self._run_identifier

    # If the name has been set explicitly, we don't want to create
    # dated directories. The same is true for string run identifiers.
    skip_date_suffix = (
        int(os.environ.get("TUNE_DISABLE_DATED_SUBDIR", 0)) == 1 or name
        or isinstance(run, str))
    if skip_date_suffix:
        self.dir_name = self.name
    else:
        self.dir_name = f"{self.name}_{date_str()}"

    self.remote_checkpoint_dir = (os.path.join(upload_dir, self.dir_name)
                                  if upload_dir else None)

    self._stopper = None
    stopping_criteria = {}
    if stop:
        if isinstance(stop, dict):
            stopping_criteria = stop
        elif not callable(stop):
            raise ValueError("Invalid stop criteria: {}. Must be a "
                             "callable or dict".format(stop))
        elif FunctionStopper.is_valid_function(stop):
            self._stopper = FunctionStopper(stop)
        elif issubclass(type(stop), Stopper):
            self._stopper = stop
        else:
            raise ValueError("Provided stop object must be either a dict, "
                             "a function, or a subclass of "
                             "`ray.tune.Stopper`.")

    if time_budget_s:
        if self._stopper:
            self._stopper = CombinedStopper(self._stopper,
                                            TimeoutStopper(time_budget_s))
        else:
            self._stopper = TimeoutStopper(time_budget_s)

    _raise_on_durable(self._run_identifier, sync_to_driver, upload_dir)

    stdout_file, stderr_file = _validate_log_to_file(log_to_file)

    resolved_local_dir = os.path.abspath(
        os.path.expanduser(local_dir or DEFAULT_RESULTS_DIR))
    resolved_restore = (os.path.abspath(os.path.expanduser(restore))
                        if restore else None)

    self.spec = {
        "run": self._run_identifier,
        "stop": stopping_criteria,
        "config": config,
        "resources_per_trial": resources_per_trial,
        "num_samples": num_samples,
        "local_dir": resolved_local_dir,
        "upload_dir": upload_dir,
        "remote_checkpoint_dir": self.remote_checkpoint_dir,
        "trial_name_creator": trial_name_creator,
        "trial_dirname_creator": trial_dirname_creator,
        "loggers": loggers,
        "log_to_file": (stdout_file, stderr_file),
        "sync_to_driver": sync_to_driver,
        "checkpoint_freq": checkpoint_freq,
        "checkpoint_at_end": checkpoint_at_end,
        "sync_on_checkpoint": sync_on_checkpoint,
        "keep_checkpoints_num": keep_checkpoints_num,
        "checkpoint_score_attr": checkpoint_score_attr,
        "export_formats": export_formats or [],
        "max_failures": max_failures,
        "restore": resolved_restore,
    }
def with_parameters(fn, **kwargs):
    """Wrapper for function trainables to pass arbitrary large data objects.

    This wrapper function will store all passed parameters in the Ray
    object store and retrieve them when calling the function. It can thus
    be used to pass arbitrary data, even datasets, to Tune trainable
    functions.

    This can also be used as an alternative to `functools.partial` to pass
    default arguments to trainables.

    Args:
        fn: function to wrap
        **kwargs: parameters to store in object store.

    Returns:
        A wrapper function that forwards the stored parameters as keyword
        arguments and returns whatever ``fn`` returns. It takes
        ``(config, checkpoint_dir=None)`` if ``fn`` was detected as a
        checkpoint function and ``(config)`` otherwise. ``__mixins__`` of
        ``fn`` are copied onto the wrapper.

    Raises:
        ValueError: If ``fn`` is not callable.

    .. code-block:: python

        from ray import tune

        def train(config, data=None):
            for sample in data:
                # ...
                tune.report(loss=loss)

        data = HugeDataset(download=True)

        tune.run(
            tune.with_parameters(train, data=data),
            #...
        )

    """
    if not callable(fn):
        raise ValueError(
            "`tune.with_parameters()` only works with the function API. "
            "If you want to pass parameters to Trainable _classes_, consider "
            "passing them via the `config` parameter.")

    prefix = f"{str(fn)}_"
    for k, v in kwargs.items():
        parameter_registry.put(prefix + k, v)

    use_checkpoint = detect_checkpoint_function(fn)
    keys = list(kwargs.keys())

    def inner(config, checkpoint_dir=None):
        fn_kwargs = {}
        if use_checkpoint:
            # The directory handed in by the caller always wins. Only when
            # none was given do we fall back to the default declared on
            # `fn` - and never to the `Parameter.empty` sentinel that marks
            # "no default declared".
            resolved_dir = checkpoint_dir
            if resolved_dir is None:
                param = inspect.signature(fn).parameters.get("checkpoint_dir")
                if (param is not None
                        and param.default is not inspect.Parameter.empty):
                    resolved_dir = param.default
            fn_kwargs["checkpoint_dir"] = resolved_dir

        for k in keys:
            fn_kwargs[k] = parameter_registry.get(prefix + k)
        # Propagate the result so the wrapper is transparent to callers
        # that look at the trainable's return value.
        return fn(config, **fn_kwargs)

    # Use correct function signature if no `checkpoint_dir` parameter is set
    if not use_checkpoint:

        def _inner(config):
            return inner(config, checkpoint_dir=None)

        if hasattr(fn, "__mixins__"):
            _inner.__mixins__ = fn.__mixins__
        return _inner

    if hasattr(fn, "__mixins__"):
        inner.__mixins__ = fn.__mixins__

    return inner
def DistributedTrainableCreator(
        func: Callable,
        use_gpu: bool = False,
        num_workers: int = 1,
        num_cpus_per_worker: int = 1,
        backend: str = "gloo",
        timeout_s: int = NCCL_TIMEOUT_S) -> Type[_TorchTrainable]:
    """Creates a class that executes distributed training.

    Similar to running `torch.distributed.launch`.

    Note that you typically should not instantiate the object
    created.

    Args:
        func (callable): This function is a Tune trainable function.
            This function must have 2 args in the signature, and the
            latter arg must contain `checkpoint_dir`. For example:
            `func(config, checkpoint_dir=None)`.
        use_gpu (bool): Sets resource allocation for workers to 1 GPU
            if true. Also automatically sets CUDA_VISIBLE_DEVICES
            for each training worker.
        num_workers (int): Number of training workers to include in
            world.
        num_cpus_per_worker (int): Number of CPU resources to reserve
            per training worker.
        backend (str): One of "gloo", "nccl".
        timeout_s (float): Seconds before the torch process group
            times out. Useful when machines are unreliable. Defaults
            to ``NCCL_TIMEOUT_S``.

    Returns:
        type(Trainable): A trainable class object that can be passed
        to Tune. Resources are automatically set within the object, so
        users do not need to set `resources_per_trainable`.

    Example:

    .. code-block:: python

        trainable_cls = DistributedTrainableCreator(
            train_func, num_workers=2)
        analysis = tune.run(trainable_cls)
    """
    detect_checkpoint_function(func, abort=True)

    class WrappedDistributedTorchTrainable(_TorchTrainable):
        _function = func
        _num_workers = num_workers
        _use_gpu = use_gpu
        _num_cpus_per_worker = num_cpus_per_worker

        @classmethod
        def default_process_group_parameters(cls) -> Dict:
            # `timedelta`'s first positional argument is *days*; the timeout
            # is given in seconds, so it must be passed by keyword.
            return dict(timeout=timedelta(seconds=timeout_s), backend=backend)

        @classmethod
        def default_resource_request(cls, config: Dict) -> Resources:
            # Values in the trial config take precedence over the values
            # given when the class was created.
            num_workers_ = int(config.get("num_workers", num_workers))
            num_cpus = int(
                config.get("num_cpus_per_worker", num_cpus_per_worker))
            use_gpu_ = config.get("use_gpu", use_gpu)

            return Resources(
                cpu=0,
                gpu=0,
                extra_cpu=num_cpus * num_workers_,
                extra_gpu=num_workers_ if use_gpu_ else 0)

    return WrappedDistributedTorchTrainable
def __init__(self,
             name,
             run,
             stop=None,
             time_budget_s=None,
             config=None,
             resources_per_trial=None,
             num_samples=1,
             local_dir=None,
             upload_dir=None,
             trial_name_creator=None,
             trial_dirname_creator=None,
             loggers=None,
             log_to_file=False,
             sync_to_driver=None,
             checkpoint_freq=0,
             checkpoint_at_end=False,
             sync_on_checkpoint=True,
             keep_checkpoints_num=None,
             checkpoint_score_attr=None,
             export_formats=None,
             max_failures=0,
             restore=None):
    """Validate the experiment arguments and assemble ``self.spec``.

    Args:
        name: Experiment name; falls back to the registered run
            identifier when falsy.
        run: Trainable (or its string identifier), registered through
            ``Experiment.register_if_needed``.
        stop: Falsy, a dict of stopping criteria, a function accepted by
            ``FunctionStopper.is_valid_function`` or a callable
            ``Stopper`` instance.
        time_budget_s: If truthy, a ``TimeoutStopper`` is installed
            (combined with any stopper derived from ``stop``).
        config: Stored under ``"config"``; defaults to an empty dict.
        local_dir: Expanded and made absolute; defaults to
            ``DEFAULT_RESULTS_DIR``.
        upload_dir: If truthy, ``remote_checkpoint_dir`` becomes
            ``upload_dir`` joined with the experiment name.
        log_to_file: Resolved by ``_validate_log_to_file`` into a
            ``(stdout_file, stderr_file)`` pair.
        checkpoint_freq, checkpoint_at_end: Rejected when ``run`` is a
            checkpointable function.
        restore: If truthy, expanded and made absolute.

    All remaining arguments are stored unchanged in ``self.spec``.

    Raises:
        ValueError: On checkpoint settings combined with a checkpointable
            function, or on an unsupported ``stop`` value.
    """
    config = config or {}

    is_checkpointable_fn = callable(run) and detect_checkpoint_function(run)
    if is_checkpointable_fn and checkpoint_at_end:
        raise ValueError("'checkpoint_at_end' cannot be used with a "
                         "checkpointable function. You can specify "
                         "and register checkpoints within "
                         "your trainable function.")
    if is_checkpointable_fn and checkpoint_freq:
        raise ValueError(
            "'checkpoint_freq' cannot be used with a "
            "checkpointable function. You can specify checkpoints "
            "within your trainable function.")

    self._run_identifier = Experiment.register_if_needed(run)
    self.name = name or self._run_identifier

    self.remote_checkpoint_dir = (os.path.join(upload_dir, self.name)
                                  if upload_dir else None)

    self._stopper = None
    stopping_criteria = {}
    if stop:
        if isinstance(stop, dict):
            stopping_criteria = stop
        elif not callable(stop):
            raise ValueError("Invalid stop criteria: {}. Must be a "
                             "callable or dict".format(stop))
        elif FunctionStopper.is_valid_function(stop):
            self._stopper = FunctionStopper(stop)
        elif issubclass(type(stop), Stopper):
            self._stopper = stop
        else:
            raise ValueError("Provided stop object must be either a dict, "
                             "a function, or a subclass of "
                             "`ray.tune.Stopper`.")

    if time_budget_s:
        if self._stopper:
            self._stopper = CombinedStopper(self._stopper,
                                            TimeoutStopper(time_budget_s))
        else:
            self._stopper = TimeoutStopper(time_budget_s)

    _raise_on_durable(self._run_identifier, sync_to_driver, upload_dir)

    stdout_file, stderr_file = _validate_log_to_file(log_to_file)

    resolved_local_dir = os.path.abspath(
        os.path.expanduser(local_dir or DEFAULT_RESULTS_DIR))
    resolved_restore = (os.path.abspath(os.path.expanduser(restore))
                        if restore else None)

    self.spec = {
        "run": self._run_identifier,
        "stop": stopping_criteria,
        "config": config,
        "resources_per_trial": resources_per_trial,
        "num_samples": num_samples,
        "local_dir": resolved_local_dir,
        "upload_dir": upload_dir,
        "remote_checkpoint_dir": self.remote_checkpoint_dir,
        "trial_name_creator": trial_name_creator,
        "trial_dirname_creator": trial_dirname_creator,
        "loggers": loggers,
        "log_to_file": (stdout_file, stderr_file),
        "sync_to_driver": sync_to_driver,
        "checkpoint_freq": checkpoint_freq,
        "checkpoint_at_end": checkpoint_at_end,
        "sync_on_checkpoint": sync_on_checkpoint,
        "keep_checkpoints_num": keep_checkpoints_num,
        "checkpoint_score_attr": checkpoint_score_attr,
        "export_formats": export_formats or [],
        "max_failures": max_failures,
        "restore": resolved_restore,
    }
def wrap_function(
    train_func: Callable[[Any], Any], warn: bool = True, name: Optional[str] = None
):
    """Wrap a function trainable into a ``FunctionTrainable`` subclass.

    The calling convention of ``train_func`` is probed with
    ``detect_checkpoint_function``, ``detect_config_single`` and
    ``detect_reporter``. Values returned (or yielded, for generator
    functions) by ``train_func`` are forwarded to the reporter.

    Args:
        train_func: Training function to wrap. If it has a ``__mixins__``
            tuple, those classes are placed in front of
            ``FunctionTrainable`` in the bases of the generated class.
        warn: Whether the "function checkpointing is disabled" warning may
            be emitted for config-only functions.
        name: Name of the generated trainable. Falls back to
            ``train_func.__name__`` and finally to ``"func"``.

    Returns:
        The generated ``ImplicitFunc`` class.

    Raises:
        ValueError: If none of the supported signatures was detected.
    """
    inherit_from = (FunctionTrainable,)

    if hasattr(train_func, "__mixins__"):
        inherit_from = train_func.__mixins__ + inherit_from

    func_args = inspect.getfullargspec(train_func).args
    use_checkpoint = detect_checkpoint_function(train_func)
    use_config_single = detect_config_single(train_func)
    use_reporter = detect_reporter(train_func)

    if not any([use_checkpoint, use_config_single, use_reporter]):
        # use_reporter is hidden
        raise ValueError(
            "Unknown argument found in the Trainable function. "
            "The function args must include a 'config' positional "
            "parameter. Any other args must be 'checkpoint_dir'. "
            "Found: {}".format(func_args)
        )

    if use_config_single and not use_checkpoint:
        if log_once("tune_function_checkpoint") and warn:
            logger.warning(
                "Function checkpointing is disabled. This may result in "
                "unexpected behavior when using checkpointing features or "
                "certain schedulers. To enable, set the train function "
                "arguments to be `func(config, checkpoint_dir=None)`."
            )

    class ImplicitFunc(*inherit_from):
        _name = name or (
            train_func.__name__ if hasattr(train_func, "__name__") else "func"
        )

        def __repr__(self):
            return self._name

        def _trainable_func(self, config, reporter, checkpoint_dir):
            """Run ``train_func`` and report its returned/yielded values.

            Raises:
                ValueError: If a truthy returned/yielded value is neither a
                    dict nor a number.
            """
            if not use_checkpoint and not use_reporter:
                fn = partial(train_func, config)
            elif use_checkpoint:
                fn = partial(train_func, config, checkpoint_dir=checkpoint_dir)
            else:
                fn = partial(train_func, config, reporter)

            def handle_output(output):
                # A numeric zero (e.g. a loss of 0.0) is a legitimate metric
                # and must not be mistaken for "nothing returned". Booleans
                # are excluded so that `False` keeps being ignored.
                is_zero_metric = isinstance(output, Number) and not isinstance(
                    output, bool
                )
                if not output and not is_zero_metric:
                    return
                if isinstance(output, dict):
                    reporter(**output)
                elif isinstance(output, Number):
                    reporter(_metric=output)
                else:
                    raise ValueError(
                        "Invalid return or yield value. Either return/yield "
                        "a single number or a dictionary object in your "
                        "trainable function."
                    )

            output = None
            if inspect.isgeneratorfunction(train_func):
                for output in fn():
                    handle_output(output)
            else:
                output = fn()
                handle_output(output)

            # If train_func returns, we need to notify the main event loop
            # of the last result while avoiding double logging. This is done
            # with the keyword RESULT_DUPLICATE -- see tune/trial_runner.py.
            reporter(**{RESULT_DUPLICATE: True})
            return output

    return ImplicitFunc
def DistributedTrainableCreator(
    func: Callable[[Dict, Optional[str]], Any],
    num_workers: int = 1,
    num_cpus_per_worker: int = 1,
    num_gpus_per_worker: int = 0,
    num_workers_per_host: Optional[int] = None,
    backend: str = "gloo",
    timeout_s: int = NCCL_TIMEOUT_S,
) -> Type[_TorchTrainable]:
    """Build a Tune trainable class that runs ``func`` on a torch process group.

    Comparable to launching with `torch.distributed.launch`. The returned
    class is meant to be handed to Tune, not instantiated directly. Calling
    this function emits a ``PendingDeprecationWarning``.

    Args:
        func: Tune trainable function. It is checked with
            ``detect_checkpoint_function(func, abort=True)``; the expected
            form is `func(config, checkpoint_dir=None)`.
        num_workers: Number of training workers in the world.
        num_cpus_per_worker: CPUs reserved for each training worker.
        num_gpus_per_worker: GPUs reserved for each training worker.
        num_workers_per_host: Number of workers to colocate per host. If
            set, `num_workers` must be a multiple of it.
        backend: One of "gloo", "nccl".
        timeout_s: Seconds before the torch process group times out.
            Defaults to ``NCCL_TIMEOUT_S``. Also stored on the returned
            class as ``_timeout_s``.

    Returns:
        type(Trainable): A trainable class that can be passed to Tune. Its
        resource request is derived from the arguments above, so users do
        not need to set `resources_per_trainable`.

    Raises:
        ValueError: If `num_workers` is not a multiple of
            `num_workers_per_host`.

    Example:

    .. code-block:: python

        trainable_cls = DistributedTrainableCreator(
            train_func, num_workers=2)
        analysis = tune.run(trainable_cls)
    """
    deprecation_message = (
        "Ray Tune's `DistributedTrainableCreator` will be deprecated in Ray "
        "2.0, and will be replaced by Ray AI Runtime (Ray AIR). Ray AIR ("
        "https://docs.ray.io/en/latest/ray-air/getting-started.html) will "
        "provide greater functionality than `DistributedTrainableCreator`, "
        "and with a more flexible and easy-to-use API."
    )
    warnings.warn(deprecation_message, PendingDeprecationWarning, stacklevel=2)

    detect_checkpoint_function(func, abort=True)

    if num_workers_per_host and num_workers % num_workers_per_host:
        raise ValueError(
            "`num_workers` must be an integer multiple of workers_per_node."
        )

    class WrappedDistributedTorchTrainable(_TorchTrainable):
        _function = func
        _num_workers = num_workers
        _num_cpus_per_worker = num_cpus_per_worker
        _num_gpus_per_worker = num_gpus_per_worker
        _num_workers_per_host = num_workers_per_host
        _timeout_s = timeout_s

        @classmethod
        def default_process_group_parameters(cls) -> Dict:
            group_timeout = timedelta(seconds=timeout_s)
            return {"timeout": group_timeout, "backend": backend}

        @classmethod
        def default_resource_request(cls, config: Dict) -> PlacementGroupFactory:
            # One empty bundle, followed by one bundle per worker.
            worker_bundle = {
                "CPU": cls._num_cpus_per_worker,
                "GPU": cls._num_gpus_per_worker,
            }
            bundles = [{}] + [worker_bundle] * num_workers
            return PlacementGroupFactory(bundles)

    return WrappedDistributedTorchTrainable
def __init__(self,
             name,
             run,
             stop=None,
             time_budget_s=None,
             config=None,
             resources_per_trial=None,
             num_samples=1,
             local_dir=None,
             upload_dir=None,
             trial_name_creator=None,
             trial_dirname_creator=None,
             log_to_file=False,
             sync_to_driver=None,
             sync_to_cloud=None,
             checkpoint_freq=0,
             checkpoint_at_end=False,
             sync_on_checkpoint=True,
             keep_checkpoints_num=None,
             checkpoint_score_attr=None,
             export_formats=None,
             max_failures=0,
             restore=None):
    """Validate the experiment arguments and assemble ``self.spec``.

    Args:
        name: Experiment name; falls back to the registered run
            identifier when falsy.
        run: Trainable (or its string identifier), registered through
            ``Experiment.register_if_needed``.
        stop: Falsy, a list of ``Stopper`` instances, a dict of stopping
            criteria, a function accepted by
            ``FunctionStopper.is_valid_function`` or a callable
            ``Stopper`` instance.
        time_budget_s: If truthy, a ``TimeoutStopper`` is installed
            (combined with any stopper derived from ``stop``).
        config: Stored under ``"config"``; defaults to an empty dict.
        local_dir: Expanded and made absolute; defaults to
            ``DEFAULT_RESULTS_DIR``.
        upload_dir: If truthy, ``remote_checkpoint_dir`` becomes
            ``upload_dir`` joined with the experiment directory name.
        log_to_file: Resolved by ``_validate_log_to_file`` into a
            ``(stdout_file, stderr_file)`` pair.
        checkpoint_freq, checkpoint_at_end: Rejected when ``run`` is a
            checkpointable function (classes are exempt).
        restore: If truthy, expanded and made absolute.

    All remaining arguments are stored unchanged in ``self.spec``.

    Raises:
        ValueError: On checkpoint settings combined with a checkpointable
            function, or on an unsupported ``stop`` value.
    """
    config = config or {}

    is_checkpointable_fn = (callable(run) and not inspect.isclass(run)
                            and detect_checkpoint_function(run))
    if is_checkpointable_fn and checkpoint_at_end:
        raise ValueError("'checkpoint_at_end' cannot be used with a "
                         "checkpointable function. You can specify "
                         "and register checkpoints within "
                         "your trainable function.")
    if is_checkpointable_fn and checkpoint_freq:
        raise ValueError(
            "'checkpoint_freq' cannot be used with a "
            "checkpointable function. You can specify checkpoints "
            "within your trainable function.")

    self._run_identifier = Experiment.register_if_needed(run)
    self.name = name or self._run_identifier

    # If the name has been set explicitly, we don't want to create
    # dated directories. The same is true for string run identifiers.
    skip_date_suffix = (
        int(os.environ.get("TUNE_DISABLE_DATED_SUBDIR", 0)) == 1 or name
        or isinstance(run, str))
    if skip_date_suffix:
        self.dir_name = self.name
    else:
        self.dir_name = f"{self.name}_{date_str()}"

    self.remote_checkpoint_dir = (os.path.join(upload_dir, self.dir_name)
                                  if upload_dir else None)

    self._stopper = None
    stopping_criteria = {}
    if stop:
        if isinstance(stop, list):
            if not all(isinstance(s, Stopper) for s in stop):
                raise ValueError(
                    "If you pass a list as the `stop` argument to "
                    "`tune.run()`, each element must be an instance of "
                    "`tune.stopper.Stopper`.")
            self._stopper = CombinedStopper(*stop)
        elif isinstance(stop, dict):
            stopping_criteria = stop
        elif not callable(stop):
            raise ValueError("Invalid stop criteria: {}. Must be a "
                             "callable or dict".format(stop))
        elif FunctionStopper.is_valid_function(stop):
            self._stopper = FunctionStopper(stop)
        elif issubclass(type(stop), Stopper):
            self._stopper = stop
        else:
            raise ValueError("Provided stop object must be either a dict, "
                             "a function, or a subclass of "
                             "`ray.tune.Stopper`.")

    if time_budget_s:
        if self._stopper:
            self._stopper = CombinedStopper(self._stopper,
                                            TimeoutStopper(time_budget_s))
        else:
            self._stopper = TimeoutStopper(time_budget_s)

    _raise_on_durable(self.is_durable_trainable, sync_to_driver, upload_dir)

    stdout_file, stderr_file = _validate_log_to_file(log_to_file)

    resolved_local_dir = os.path.abspath(
        os.path.expanduser(local_dir or DEFAULT_RESULTS_DIR))
    resolved_restore = (os.path.abspath(os.path.expanduser(restore))
                        if restore else None)

    self.spec = {
        "run": self._run_identifier,
        "stop": stopping_criteria,
        "config": config,
        "resources_per_trial": resources_per_trial,
        "num_samples": num_samples,
        "local_dir": resolved_local_dir,
        "upload_dir": upload_dir,
        "remote_checkpoint_dir": self.remote_checkpoint_dir,
        "trial_name_creator": trial_name_creator,
        "trial_dirname_creator": trial_dirname_creator,
        "log_to_file": (stdout_file, stderr_file),
        "sync_to_driver": sync_to_driver,
        "sync_to_cloud": sync_to_cloud,
        "checkpoint_freq": checkpoint_freq,
        "checkpoint_at_end": checkpoint_at_end,
        "sync_on_checkpoint": sync_on_checkpoint,
        "keep_checkpoints_num": keep_checkpoints_num,
        "checkpoint_score_attr": checkpoint_score_attr,
        "export_formats": export_formats or [],
        "max_failures": max_failures,
        "restore": resolved_restore,
    }
def __init__(
    self,
    name,
    run,
    stop=None,
    time_budget_s=None,
    config=None,
    resources_per_trial=None,
    num_samples=1,
    local_dir=None,
    _experiment_checkpoint_dir: Optional[str] = None,
    sync_config=None,
    trial_name_creator=None,
    trial_dirname_creator=None,
    log_to_file=False,
    checkpoint_freq=0,
    checkpoint_at_end=False,
    keep_checkpoints_num=None,
    checkpoint_score_attr=None,
    export_formats=None,
    max_failures=0,
    restore=None,
):
    """Validate the experiment arguments and assemble ``self.spec``.

    Args:
        name: Experiment name; falls back to the registered run
            identifier when falsy.
        run: Trainable (or its string identifier), registered through
            ``Experiment.register_if_needed``.
        stop: Falsy, a list of ``Stopper`` instances, a dict of stopping
            criteria, a function accepted by
            ``FunctionStopper.is_valid_function`` or a callable
            ``Stopper`` instance.
        time_budget_s: If truthy, a ``TimeoutStopper`` is installed
            (combined with any stopper derived from ``stop``).
        config: Stored under ``"config"``; defaults to an empty dict.
        local_dir: Resolved through ``_get_local_dir_with_expand_user``.
        _experiment_checkpoint_dir: Internal. If set, it must lie below
            ``local_dir``, and ``dir_name`` is derived from it.
        sync_config: Defaults to a fresh ``SyncConfig()``; its
            ``upload_dir`` determines ``remote_checkpoint_dir``.
        log_to_file: Resolved by ``_validate_log_to_file`` into a
            ``(stdout_file, stderr_file)`` pair.
        checkpoint_freq, checkpoint_at_end: Rejected when ``run`` is a
            checkpointable function (classes are exempt).
        restore: If truthy, expanded and made absolute.

    All remaining arguments are stored unchanged in ``self.spec``.

    Raises:
        ValueError: On checkpoint settings combined with a checkpointable
            function, or on an unsupported ``stop`` value.
        TuneError: If registering ``run`` fails with a grpc
            ``RESOURCE_EXHAUSTED`` error.
    """
    local_dir = _get_local_dir_with_expand_user(local_dir)

    # `_experiment_checkpoint_dir` is for internal use only for better
    # support of Tuner API.
    # If set, it should be a subpath under `local_dir`. Also deduce `dir_name`.
    self._experiment_checkpoint_dir = _experiment_checkpoint_dir
    if _experiment_checkpoint_dir:
        checkpoint_path = Path(_experiment_checkpoint_dir)
        assert Path(local_dir) in checkpoint_path.parents
        # `dir_name` is set by `_experiment_checkpoint_dir` indirectly.
        self.dir_name = os.path.relpath(_experiment_checkpoint_dir, local_dir)

    config = config or {}
    sync_config = sync_config or SyncConfig()

    is_checkpointable_fn = (
        callable(run)
        and not inspect.isclass(run)
        and detect_checkpoint_function(run)
    )
    if is_checkpointable_fn and checkpoint_at_end:
        raise ValueError(
            "'checkpoint_at_end' cannot be used with a "
            "checkpointable function. You can specify "
            "and register checkpoints within "
            "your trainable function."
        )
    if is_checkpointable_fn and checkpoint_freq:
        raise ValueError(
            "'checkpoint_freq' cannot be used with a "
            "checkpointable function. You can specify checkpoints "
            "within your trainable function."
        )

    try:
        self._run_identifier = Experiment.register_if_needed(run)
    except grpc.RpcError as e:
        # Only the "payload too large" case gets a friendlier error.
        if e.code() != grpc.StatusCode.RESOURCE_EXHAUSTED:
            raise e
        raise TuneError(
            f"The Trainable/training function is too large for grpc resource "
            f"limit. Check that its definition is not implicitly capturing a "
            f"large array or other object in scope. "
            f"Tip: use tune.with_parameters() to put large objects "
            f"in the Ray object store. \n"
            f"Original exception: {traceback.format_exc()}"
        )

    self.name = name or self._run_identifier

    if not _experiment_checkpoint_dir:
        self.dir_name = _get_dir_name(run, name, self.name)

    assert self.dir_name

    self.remote_checkpoint_dir = (
        os.path.join(sync_config.upload_dir, self.dir_name)
        if sync_config.upload_dir
        else None
    )

    self._stopper = None
    stopping_criteria = {}
    if stop:
        if isinstance(stop, list):
            if not all(isinstance(s, Stopper) for s in stop):
                stopper_types = [type(s) for s in stop]
                raise ValueError(
                    "If you pass a list as the `stop` argument to "
                    "`tune.run()`, each element must be an instance of "
                    f"`tune.stopper.Stopper`. Got {stopper_types}."
                )
            self._stopper = CombinedStopper(*stop)
        elif isinstance(stop, dict):
            stopping_criteria = stop
        elif not callable(stop):
            raise ValueError(
                f"Invalid stop criteria: {stop}. Must be a "
                f"callable or dict. Got {type(stop)}."
            )
        elif FunctionStopper.is_valid_function(stop):
            self._stopper = FunctionStopper(stop)
        elif isinstance(stop, Stopper):
            self._stopper = stop
        else:
            raise ValueError(
                "Provided stop object must be either a dict, "
                "a function, or a subclass of "
                f"`ray.tune.Stopper`. Got {type(stop)}."
            )

    if time_budget_s:
        if self._stopper:
            self._stopper = CombinedStopper(
                self._stopper, TimeoutStopper(time_budget_s)
            )
        else:
            self._stopper = TimeoutStopper(time_budget_s)

    stdout_file, stderr_file = _validate_log_to_file(log_to_file)

    resolved_restore = (
        os.path.abspath(os.path.expanduser(restore)) if restore else None
    )

    self.spec = {
        "run": self._run_identifier,
        "stop": stopping_criteria,
        "time_budget_s": time_budget_s,
        "config": config,
        "resources_per_trial": resources_per_trial,
        "num_samples": num_samples,
        "local_dir": local_dir,
        "sync_config": sync_config,
        "remote_checkpoint_dir": self.remote_checkpoint_dir,
        "trial_name_creator": trial_name_creator,
        "trial_dirname_creator": trial_dirname_creator,
        "log_to_file": (stdout_file, stderr_file),
        "checkpoint_freq": checkpoint_freq,
        "checkpoint_at_end": checkpoint_at_end,
        "keep_checkpoints_num": keep_checkpoints_num,
        "checkpoint_score_attr": checkpoint_score_attr,
        "export_formats": export_formats or [],
        "max_failures": max_failures,
        "restore": resolved_restore,
    }
def DistributedTrainableCreator(
        func: Callable,
        num_workers: int = 2,
        num_gpus_per_worker: int = 0,
        num_cpus_per_worker: int = 1,
        num_workers_per_host: Optional[int] = None,
        timeout_s: int = 15 * 60,
) -> Type[_TensorFlowTrainable]:
    """Converts TensorFlow MultiWorkerMirror training to be executable by Tune.

    Requires TensorFlow > 2.0 to work, recommends TensorFlow > 2.2.

    This function wraps and sets resources for a TF distributed training
    function to be used with Tune. It generates a ``_TensorFlowTrainable``
    subclass that stores ``func`` and the resource settings as class
    attributes and requests a placement group made of one empty bundle
    followed by one CPU/GPU bundle per worker.

    Note: there is no fault tolerance at the moment.

    Args:
        func (Callable[[dict], None]): A training function that takes in
            a config dict for hyperparameters. It is checked with
            ``detect_checkpoint_function(func, abort=True)`` before the
            class is built.
            NOTE(review): presumably this requires a ``checkpoint_dir``
            argument in the signature -- confirm against that helper.
        num_workers (int): Number of workers each trial uses; one
            resource bundle is requested per worker.
        num_gpus_per_worker (int): Number of GPUs to request
            from Ray per worker.
        num_cpus_per_worker (int): Number of CPUs to request
            from Ray per worker.
        num_workers_per_host (Optional[int]): Number of workers to
            colocate per host. None if not specified. When set,
            ``num_workers`` must be a multiple of it.
        timeout_s (int): Seconds before triggering placement timeouts
            if forcing colocation. Default to 15 minutes.

    Returns:
        Trainable class that can be passed into `tune.run`.

    Raises:
        ValueError: If ``num_workers_per_host`` is set and ``num_workers``
            is not an integer multiple of it.

    .. versionadded:: 1.1.0

    Example:

    .. code-block:: python

        # Please refer to full example in tf_distributed_keras_example.py
        tf_trainable = DistributedTrainableCreator(
            train_mnist,
            num_workers=2)
        tune.run(tf_trainable,
                 num_samples=1)
    """
    detect_checkpoint_function(func, abort=True)

    # Workers are packed onto hosts in groups of `num_workers_per_host`, so
    # the total has to divide evenly. A falsy value (None/0) skips the check.
    if num_workers_per_host and num_workers % num_workers_per_host:
        raise ValueError("`num_workers` must be an integer multiple "
                         f"of num_workers_per_host. Got: "
                         f"num_workers: {num_workers}, "
                         f"num_workers_per_host: {num_workers_per_host}")

    class WrappedDistributedTensorFlowTrainable(_TensorFlowTrainable):
        _function = func
        _num_workers = num_workers
        _num_cpus_per_worker = num_cpus_per_worker
        _num_workers_per_host = num_workers_per_host
        _num_gpus_per_worker = num_gpus_per_worker
        _timeout_s = timeout_s

        @classmethod
        def default_resource_request(cls, config: Dict) -> Resources:
            """Request one empty bundle plus one bundle per worker."""
            worker_bundle = {
                "CPU": cls._num_cpus_per_worker,
                "GPU": cls._num_gpus_per_worker,
            }
            # Read the worker count from the class, like the CPU/GPU values,
            # so the request always agrees with the class attributes.
            return PlacementGroupFactory(
                [{}] + [worker_bundle] * cls._num_workers)

    return WrappedDistributedTensorFlowTrainable
def wrap_function(train_func: Callable[[Any], Any],
                  warn: bool = True,
                  name: Optional[str] = None) -> Type["FunctionTrainable"]:
    """Wrap a training function into a ``FunctionTrainable`` subclass.

    The signature of ``train_func`` is inspected with
    ``detect_checkpoint_function``, ``detect_config_single`` and
    ``detect_reporter`` to decide how it is called:

    * config only: ``train_func(config)``
    * checkpointing: ``train_func(config, checkpoint_dir=checkpoint_dir)``
    * reporter (hidden/legacy): ``train_func(config, reporter)``

    Values returned (or yielded, for generator functions) by ``train_func``
    are forwarded to the reporter: a dict is reported as keyword results and
    a number is reported as ``_metric``.

    Args:
        train_func: The user training function to wrap. If it has a
            ``__mixins__`` attribute, those classes are placed before
            ``FunctionTrainable`` in the bases of the generated class.
        warn: Whether to emit the "checkpointing is disabled" log message
            and the ``checkpoint_dir`` deprecation warning (each at most
            once, gated by ``log_once``).
        name: Optional name for the generated class. Falls back to
            ``train_func.__name__`` and finally to ``"func"``.

    Returns:
        The generated ``ImplicitFunc`` class.

    Raises:
        ValueError: If none of the supported signatures is detected.
    """
    inherit_from = (FunctionTrainable, )
    if hasattr(train_func, "__mixins__"):
        inherit_from = train_func.__mixins__ + inherit_from

    func_args = inspect.getfullargspec(train_func).args

    use_checkpoint = detect_checkpoint_function(train_func)
    use_config_single = detect_config_single(train_func)
    use_reporter = detect_reporter(train_func)

    if not (use_checkpoint or use_config_single or use_reporter):
        # use_reporter is hidden
        raise ValueError(
            "Unknown argument found in the Trainable function. "
            "The function args must include a 'config' positional "
            "parameter. Any other args must be 'checkpoint_dir'. "
            "Found: {}".format(func_args))

    # `log_once` is evaluated before `warn` on purpose: this keeps the
    # original evaluation order and therefore its once-only bookkeeping.
    if use_config_single and not use_checkpoint:
        if log_once("tune_function_checkpoint") and warn:
            logger.warning(
                "Function checkpointing is disabled. This may result in "
                "unexpected behavior when using checkpointing features or "
                "certain schedulers. To enable, set the train function "
                "arguments to be `func(config, checkpoint_dir=None)`.")

    if use_checkpoint:
        if log_once("tune_checkpoint_dir_deprecation") and warn:
            with warnings.catch_warnings():
                # DeprecationWarning is hidden by default filters; force it
                # to be shown for this one call only.
                warnings.simplefilter("always")
                warning_msg = (
                    "`checkpoint_dir` in `func(config, checkpoint_dir)` is "
                    "being deprecated. "
                    "To save and load checkpoint in trainable functions, "
                    "please use the `ray.air.session` API:\n\n"
                    "from ray.air import session\n\n"
                    "def train(config):\n"
                    " # ...\n"
                    ' session.report({"metric": metric}, checkpoint=checkpoint)\n\n'
                    "For more information please see "
                    "https://docs.ray.io/en/master/ray-air/key-concepts.html#session\n"
                )
                warnings.warn(
                    warning_msg,
                    DeprecationWarning,
                )

    class ImplicitFunc(*inherit_from):
        _name = name or getattr(train_func, "__name__", "func")

        def __repr__(self):
            return self._name

        def _trainable_func(self, config, reporter, checkpoint_dir):
            """Run ``train_func`` and forward its outputs to ``reporter``."""
            if not use_checkpoint and not use_reporter:
                fn = partial(train_func, config)
            elif use_checkpoint:
                fn = partial(
                    train_func, config, checkpoint_dir=checkpoint_dir)
            else:
                fn = partial(train_func, config, reporter)

            def handle_output(output):
                # A numeric zero is falsy but is still a valid metric, so it
                # must be reported instead of being skipped by the emptiness
                # check below. `bool` is excluded so that `False` keeps being
                # ignored like the other empty values.
                if (isinstance(output, Number)
                        and not isinstance(output, bool)):
                    reporter(_metric=output)
                elif not output:
                    # None, empty dict, False, ...: nothing to report.
                    return
                elif isinstance(output, dict):
                    reporter(**output)
                elif isinstance(output, Number):
                    # Only `True` can reach this branch.
                    reporter(_metric=output)
                else:
                    raise ValueError(
                        "Invalid return or yield value. Either return/yield "
                        "a single number or a dictionary object in your "
                        "trainable function.")

            output = None
            if inspect.isgeneratorfunction(train_func):
                # Report every yielded value; `output` ends up holding the
                # last one (or None if nothing was yielded).
                for output in fn():
                    handle_output(output)
            else:
                output = fn()
                handle_output(output)

            # If train_func returns, we need to notify the main event loop
            # of the last result while avoiding double logging. This is done
            # with the keyword RESULT_DUPLICATE -- see tune/trial_runner.py.
            reporter(**{RESULT_DUPLICATE: True})
            return output

    return ImplicitFunc
def DistributedTrainableCreator(
        func: Callable[[Dict], None],
        num_workers: int = 2,
        num_gpus_per_worker: int = 0,
        num_cpus_per_worker: int = 1,
        num_workers_per_host: Optional[int] = None,
        timeout_s: int = 15 * 60,
) -> Type[_TensorFlowTrainable]:
    """Converts TensorFlow MultiWorkerMirror training to be executable by Tune.

    Requires TensorFlow > 2.0 to work, recommends TensorFlow > 2.2.

    This function wraps and sets resources for a TF distributed training
    function to be used with Tune. It generates a ``_TensorFlowTrainable``
    subclass that stores ``func`` and the resource settings as class
    attributes and requests a placement group made of one empty bundle
    followed by one CPU/GPU bundle per worker.

    Calling this function emits a ``PendingDeprecationWarning`` pointing to
    Ray AIR as the replacement.

    Note: there is no fault tolerance at the moment.

    Args:
        func: A training function that takes in
            a config dict for hyperparameters. It is checked with
            ``detect_checkpoint_function(func, abort=True)`` before the
            class is built.
            NOTE(review): presumably this requires a ``checkpoint_dir``
            argument in the signature -- confirm against that helper.
        num_workers: Number of workers each trial uses; one resource
            bundle is requested per worker.
        num_gpus_per_worker: Number of GPUs to request
            from Ray per worker.
        num_cpus_per_worker: Number of CPUs to request
            from Ray per worker.
        num_workers_per_host: Number of workers to
            colocate per host. None if not specified. When set,
            ``num_workers`` must be a multiple of it.
        timeout_s: Seconds before triggering placement timeouts
            if forcing colocation. Default to 15 minutes.

    Returns:
        Trainable class that can be passed into `tune.run`.

    Raises:
        ValueError: If ``num_workers_per_host`` is set and ``num_workers``
            is not an integer multiple of it.

    .. versionadded:: 1.1.0

    Example:

    .. code-block:: python

        # Please refer to full example in tf_distributed_keras_example.py
        tf_trainable = DistributedTrainableCreator(
            train_mnist,
            num_workers=2)
        tune.run(tf_trainable,
                 num_samples=1)
    """
    # stacklevel=2 attributes the warning to the caller of this function.
    warnings.warn(
        "Ray Tune's `DistributedTrainableCreator` will be deprecated in Ray "
        "2.0, and will be replaced by Ray AI Runtime (Ray AIR). Ray AIR ("
        "https://docs.ray.io/en/latest/ray-air/getting-started.html) will "
        "provide greater functionality than `DistributedTrainableCreator`, "
        "and with a more flexible and easy-to-use API.",
        PendingDeprecationWarning,
        stacklevel=2,
    )

    detect_checkpoint_function(func, abort=True)

    # Workers are packed onto hosts in groups of `num_workers_per_host`, so
    # the total has to divide evenly. A falsy value (None/0) skips the check.
    if num_workers_per_host and num_workers % num_workers_per_host:
        raise ValueError("`num_workers` must be an integer multiple "
                         f"of num_workers_per_host. Got: "
                         f"num_workers: {num_workers}, "
                         f"num_workers_per_host: {num_workers_per_host}")

    class WrappedDistributedTensorFlowTrainable(_TensorFlowTrainable):
        _function = func
        _num_workers = num_workers
        _num_cpus_per_worker = num_cpus_per_worker
        _num_workers_per_host = num_workers_per_host
        _num_gpus_per_worker = num_gpus_per_worker
        _timeout_s = timeout_s

        @classmethod
        def default_resource_request(cls, config: Dict) -> Resources:
            """Request one empty bundle plus one bundle per worker."""
            worker_bundle = {
                "CPU": cls._num_cpus_per_worker,
                "GPU": cls._num_gpus_per_worker,
            }
            # Read the worker count from the class, like the CPU/GPU values,
            # so the request always agrees with the class attributes.
            return PlacementGroupFactory(
                [{}] + [worker_bundle] * cls._num_workers)

    return WrappedDistributedTensorFlowTrainable