def write_error_log(self, error_msg):
    """Append a failure record to ``error.txt`` inside ``self.logdir``.

    Nothing happens unless both ``error_msg`` and ``self.logdir`` are
    truthy. Otherwise ``self.num_failures`` is incremented, the log path
    is stored in ``self.error_file``, a header line and the message are
    appended to that file, and the message is kept in ``self.error_msg``.

    Args:
        error_msg: Text describing the failure; written followed by a
            newline.
    """
    if not (error_msg and self.logdir):
        return

    self.num_failures += 1
    self.error_file = os.path.join(self.logdir, "error.txt")

    # "a+" appends, so earlier failure records are preserved.
    with open(self.error_file, "a+") as log:
        header = "Failure # {} (occurred at {})\n".format(
            self.num_failures, date_str())
        log.write(header)
        log.write(error_msg + "\n")

    self.error_msg = error_msg
def _get_dir_name(run, explicit_name: Optional[str], combined_name: str) -> str: # If the name has been set explicitly, we don't want to create # dated directories. The same is true for string run identifiers. if (int(os.environ.get("TUNE_DISABLE_DATED_SUBDIR", 0)) == 1 or explicit_name or isinstance(run, str)): dir_name = combined_name else: dir_name = "{}_{}".format(combined_name, date_str()) return dir_name
def write_error_log(self,
                    exc: Optional[Union[TuneError, RayTaskError]] = None):
    """Record a failure on disk and invalidate the cached JSON state.

    When both ``exc`` and ``self.logdir`` are truthy:

    * ``self.num_failures`` is incremented and ``self.error_file`` is set
      to ``<logdir>/error.txt``;
    * if ``exc`` is a ``RayTaskError`` it is additionally dumped with
      ``cloudpickle`` to ``<logdir>/error.pkl`` (path kept in
      ``self.pickled_error_file``);
    * a header line and ``str(exc)`` are appended to ``error.txt``.

    ``self.invalidate_json_state()`` is called at the end in every case.

    Args:
        exc: The error to log, or ``None``.
    """
    should_log = exc and self.logdir
    if should_log:
        self.num_failures += 1
        self.error_file = os.path.join(self.logdir, "error.txt")

        # `exc` is already known to be truthy here.
        if isinstance(exc, RayTaskError):
            # Piping through the actual error to result grid.
            self.pickled_error_file = os.path.join(self.logdir, "error.pkl")
            with open(self.pickled_error_file, "wb") as pickle_out:
                cloudpickle.dump(exc, pickle_out)

        # "a+" appends, so earlier failure records are preserved.
        with open(self.error_file, "a+") as log:
            header = "Failure # {} (occurred at {})\n".format(
                self.num_failures, date_str())
            log.write(header)
            log.write(str(exc) + "\n")

    self.invalidate_json_state()
def __init__(self,
             name,
             run,
             stop=None,
             time_budget_s=None,
             config=None,
             resources_per_trial=None,
             num_samples=1,
             local_dir=None,
             upload_dir=None,
             trial_name_creator=None,
             trial_dirname_creator=None,
             loggers=None,
             log_to_file=False,
             sync_to_driver=None,
             checkpoint_freq=0,
             checkpoint_at_end=False,
             sync_on_checkpoint=True,
             keep_checkpoints_num=None,
             checkpoint_score_attr=None,
             export_formats=None,
             max_failures=0,
             restore=None):
    """Validate the arguments and assemble the experiment ``spec`` dict.

    Sets ``self._run_identifier``, ``self.name``, ``self.dir_name``,
    ``self.remote_checkpoint_dir``, ``self._stopper`` and ``self.spec``.

    Args:
        name: Explicit experiment name. When falsy, the identifier
            returned by ``Experiment.register_if_needed(run)`` is used.
        run: Trainable or string identifier; passed to
            ``Experiment.register_if_needed``.
        stop: Stopping condition. A dict becomes the spec's ``"stop"``
            entry. A callable accepted by
            ``FunctionStopper.is_valid_function`` is wrapped in a
            ``FunctionStopper``; any other callable must be of a type
            subclassing ``Stopper`` and is used as-is. Falsy means none.
        time_budget_s: When truthy, a ``TimeoutStopper`` built from it
            becomes the stopper, combined with an existing one through
            ``CombinedStopper``.
        config: Stored in the spec; a falsy value is replaced by ``{}``.
        resources_per_trial: Stored unchanged in the spec.
        num_samples: Stored unchanged in the spec.
        local_dir: User-expanded and made absolute before being stored;
            ``DEFAULT_RESULTS_DIR`` is used when falsy.
        upload_dir: Stored in the spec and, when truthy, joined with
            ``self.dir_name`` to form ``self.remote_checkpoint_dir``.
        trial_name_creator: Stored unchanged in the spec.
        trial_dirname_creator: Stored unchanged in the spec.
        loggers: Deprecated; any value other than ``None`` is rejected.
        log_to_file: Passed to ``_validate_log_to_file``; the resulting
            ``(stdout_file, stderr_file)`` pair is stored in the spec.
        sync_to_driver: Passed to ``_raise_on_durable`` and stored.
        checkpoint_freq: Stored in the spec; must be falsy when ``run``
            is a checkpointable function.
        checkpoint_at_end: Stored in the spec; must be falsy when
            ``run`` is a checkpointable function.
        sync_on_checkpoint: Stored unchanged in the spec.
        keep_checkpoints_num: Stored unchanged in the spec.
        checkpoint_score_attr: Stored unchanged in the spec.
        export_formats: Stored in the spec; falsy becomes ``[]``.
        max_failures: Stored unchanged in the spec.
        restore: When truthy, user-expanded and made absolute before
            being stored; otherwise ``None`` is stored.

    Raises:
        ValueError: If ``loggers`` is given, if checkpoint options are
            combined with a checkpointable function, or if ``stop`` has
            an unsupported type.
    """
    if loggers is not None:
        # `tune.run()` no longer forwards this argument, so only users
        # who build `Experiment` objects themselves end up here.
        raise ValueError(
            "Passing `loggers` to an `Experiment` is deprecated. Use "
            "an `ExperimentLogger` callback instead, e.g. by passing the "
            "`Logger` classes to `tune.logger.LegacyExperimentLogger` and "
            "passing this as part of the `callback` parameter to "
            "`tune.run()`.")

    config = config or {}

    if callable(run) and detect_checkpoint_function(run):
        # Checkpointable functions manage their own checkpoints.
        if checkpoint_at_end:
            raise ValueError("'checkpoint_at_end' cannot be used with a "
                             "checkpointable function. You can specify "
                             "and register checkpoints within "
                             "your trainable function.")
        if checkpoint_freq:
            raise ValueError(
                "'checkpoint_freq' cannot be used with a "
                "checkpointable function. You can specify checkpoints "
                "within your trainable function.")

    self._run_identifier = Experiment.register_if_needed(run)
    self.name = name or self._run_identifier

    # If the name has been set explicitly, we don't want to create
    # dated directories. The same is true for string run identifiers.
    use_plain_name = (
        int(os.environ.get("TUNE_DISABLE_DATED_SUBDIR", 0)) == 1 or name
        or isinstance(run, str))
    self.dir_name = (self.name if use_plain_name else "{}_{}".format(
        self.name, date_str()))

    self.remote_checkpoint_dir = (os.path.join(upload_dir, self.dir_name)
                                  if upload_dir else None)

    self._stopper = None
    stopping_criteria = {}
    if stop:
        if isinstance(stop, dict):
            stopping_criteria = stop
        elif callable(stop):
            if FunctionStopper.is_valid_function(stop):
                self._stopper = FunctionStopper(stop)
            elif issubclass(type(stop), Stopper):
                self._stopper = stop
            else:
                raise ValueError(
                    "Provided stop object must be either a dict, "
                    "a function, or a subclass of "
                    "`ray.tune.Stopper`.")
        else:
            raise ValueError("Invalid stop criteria: {}. Must be a "
                             "callable or dict".format(stop))

    if time_budget_s:
        # The time budget either extends an existing stopper or is the
        # only one.
        self._stopper = (CombinedStopper(self._stopper,
                                         TimeoutStopper(time_budget_s))
                         if self._stopper else
                         TimeoutStopper(time_budget_s))

    _raise_on_durable(self._run_identifier, sync_to_driver, upload_dir)

    stdout_file, stderr_file = _validate_log_to_file(log_to_file)

    self.spec = {
        "run": self._run_identifier,
        "stop": stopping_criteria,
        "config": config,
        "resources_per_trial": resources_per_trial,
        "num_samples": num_samples,
        "local_dir": os.path.abspath(
            os.path.expanduser(local_dir or DEFAULT_RESULTS_DIR)),
        "upload_dir": upload_dir,
        "remote_checkpoint_dir": self.remote_checkpoint_dir,
        "trial_name_creator": trial_name_creator,
        "trial_dirname_creator": trial_dirname_creator,
        "loggers": loggers,
        "log_to_file": (stdout_file, stderr_file),
        "sync_to_driver": sync_to_driver,
        "checkpoint_freq": checkpoint_freq,
        "checkpoint_at_end": checkpoint_at_end,
        "sync_on_checkpoint": sync_on_checkpoint,
        "keep_checkpoints_num": keep_checkpoints_num,
        "checkpoint_score_attr": checkpoint_score_attr,
        "export_formats": export_formats or [],
        "max_failures": max_failures,
        "restore": (os.path.abspath(os.path.expanduser(restore))
                    if restore else None),
    }
def __init__(self,
             name,
             run,
             stop=None,
             time_budget_s=None,
             config=None,
             resources_per_trial=None,
             num_samples=1,
             local_dir=None,
             upload_dir=None,
             trial_name_creator=None,
             trial_dirname_creator=None,
             log_to_file=False,
             sync_to_driver=None,
             sync_to_cloud=None,
             checkpoint_freq=0,
             checkpoint_at_end=False,
             sync_on_checkpoint=True,
             keep_checkpoints_num=None,
             checkpoint_score_attr=None,
             export_formats=None,
             max_failures=0,
             restore=None):
    """Validate the arguments and assemble the experiment ``spec`` dict.

    Sets ``self._run_identifier``, ``self.name``, ``self.dir_name``,
    ``self.remote_checkpoint_dir``, ``self._stopper`` and ``self.spec``.

    Args:
        name: Explicit experiment name. When falsy, the identifier
            returned by ``Experiment.register_if_needed(run)`` is used.
        run: Trainable or string identifier; passed to
            ``Experiment.register_if_needed``.
        stop: Stopping condition. A list must contain only ``Stopper``
            instances and is merged through ``CombinedStopper``. A dict
            becomes the spec's ``"stop"`` entry. A callable accepted by
            ``FunctionStopper.is_valid_function`` is wrapped in a
            ``FunctionStopper``; any other callable must be of a type
            subclassing ``Stopper`` and is used as-is. Falsy means none.
        time_budget_s: When truthy, a ``TimeoutStopper`` built from it
            becomes the stopper, combined with an existing one through
            ``CombinedStopper``.
        config: Stored in the spec; a falsy value is replaced by ``{}``.
        resources_per_trial: Stored unchanged in the spec.
        num_samples: Stored unchanged in the spec.
        local_dir: User-expanded and made absolute before being stored;
            ``DEFAULT_RESULTS_DIR`` is used when falsy.
        upload_dir: Stored in the spec and, when truthy, joined with
            ``self.dir_name`` to form ``self.remote_checkpoint_dir``.
        trial_name_creator: Stored unchanged in the spec.
        trial_dirname_creator: Stored unchanged in the spec.
        log_to_file: Passed to ``_validate_log_to_file``; the resulting
            ``(stdout_file, stderr_file)`` pair is stored in the spec.
        sync_to_driver: Passed to ``_raise_on_durable`` and stored.
        sync_to_cloud: Stored unchanged in the spec.
        checkpoint_freq: Stored in the spec; must be falsy when ``run``
            is a checkpointable function.
        checkpoint_at_end: Stored in the spec; must be falsy when
            ``run`` is a checkpointable function.
        sync_on_checkpoint: Stored unchanged in the spec.
        keep_checkpoints_num: Stored unchanged in the spec.
        checkpoint_score_attr: Stored unchanged in the spec.
        export_formats: Stored in the spec; falsy becomes ``[]``.
        max_failures: Stored unchanged in the spec.
        restore: When truthy, user-expanded and made absolute before
            being stored; otherwise ``None`` is stored.

    Raises:
        ValueError: If checkpoint options are combined with a
            checkpointable function, or if ``stop`` has an unsupported
            type or a list element that is not a ``Stopper``.
    """
    config = config or {}

    # Classes are excluded; only plain callables are inspected for
    # function-style checkpointing.
    is_function_trainable = callable(run) and not inspect.isclass(run)
    if is_function_trainable and detect_checkpoint_function(run):
        if checkpoint_at_end:
            raise ValueError("'checkpoint_at_end' cannot be used with a "
                             "checkpointable function. You can specify "
                             "and register checkpoints within "
                             "your trainable function.")
        if checkpoint_freq:
            raise ValueError(
                "'checkpoint_freq' cannot be used with a "
                "checkpointable function. You can specify checkpoints "
                "within your trainable function.")

    self._run_identifier = Experiment.register_if_needed(run)
    self.name = name or self._run_identifier

    # If the name has been set explicitly, we don't want to create
    # dated directories. The same is true for string run identifiers.
    use_plain_name = (
        int(os.environ.get("TUNE_DISABLE_DATED_SUBDIR", 0)) == 1 or name
        or isinstance(run, str))
    self.dir_name = (self.name if use_plain_name else "{}_{}".format(
        self.name, date_str()))

    self.remote_checkpoint_dir = (os.path.join(upload_dir, self.dir_name)
                                  if upload_dir else None)

    self._stopper = None
    stopping_criteria = {}
    if stop:
        if isinstance(stop, list):
            if any(not isinstance(entry, Stopper) for entry in stop):
                raise ValueError(
                    "If you pass a list as the `stop` argument to "
                    "`tune.run()`, each element must be an instance of "
                    "`tune.stopper.Stopper`.")
            self._stopper = CombinedStopper(*stop)
        elif isinstance(stop, dict):
            stopping_criteria = stop
        elif callable(stop):
            if FunctionStopper.is_valid_function(stop):
                self._stopper = FunctionStopper(stop)
            elif issubclass(type(stop), Stopper):
                self._stopper = stop
            else:
                raise ValueError(
                    "Provided stop object must be either a dict, "
                    "a function, or a subclass of "
                    "`ray.tune.Stopper`.")
        else:
            raise ValueError("Invalid stop criteria: {}. Must be a "
                             "callable or dict".format(stop))

    if time_budget_s:
        # The time budget either extends an existing stopper or is the
        # only one.
        self._stopper = (CombinedStopper(self._stopper,
                                         TimeoutStopper(time_budget_s))
                         if self._stopper else
                         TimeoutStopper(time_budget_s))

    _raise_on_durable(self.is_durable_trainable, sync_to_driver, upload_dir)

    stdout_file, stderr_file = _validate_log_to_file(log_to_file)

    self.spec = {
        "run": self._run_identifier,
        "stop": stopping_criteria,
        "config": config,
        "resources_per_trial": resources_per_trial,
        "num_samples": num_samples,
        "local_dir": os.path.abspath(
            os.path.expanduser(local_dir or DEFAULT_RESULTS_DIR)),
        "upload_dir": upload_dir,
        "remote_checkpoint_dir": self.remote_checkpoint_dir,
        "trial_name_creator": trial_name_creator,
        "trial_dirname_creator": trial_dirname_creator,
        "log_to_file": (stdout_file, stderr_file),
        "sync_to_driver": sync_to_driver,
        "sync_to_cloud": sync_to_cloud,
        "checkpoint_freq": checkpoint_freq,
        "checkpoint_at_end": checkpoint_at_end,
        "sync_on_checkpoint": sync_on_checkpoint,
        "keep_checkpoints_num": keep_checkpoints_num,
        "checkpoint_score_attr": checkpoint_score_attr,
        "export_formats": export_formats or [],
        "max_failures": max_failures,
        "restore": (os.path.abspath(os.path.expanduser(restore))
                    if restore else None),
    }