def __init__(
    self,
    tracking_uri: Optional[str] = None,
    registry_uri: Optional[str] = None,
    experiment_name: Optional[str] = None,
    tags: Optional[Dict] = None,
    save_artifact: bool = False,
):
    """Record the MLflow settings and create the logging helper.

    The arguments are only stored on the instance here; this constructor
    makes no ``setup_mlflow`` or run-creation call itself.

    Args:
        tracking_uri: Stored as ``self.tracking_uri``.
        registry_uri: Stored as ``self.registry_uri``.
        experiment_name: Stored as ``self.experiment_name``.
        tags: Stored as ``self.tags``; may be ``None``.
        save_artifact: Stored as ``self.should_save_artifact``.
    """
    self.should_save_artifact = save_artifact
    self.tags = tags
    self.experiment_name = experiment_name
    self.registry_uri = registry_uri
    self.tracking_uri = tracking_uri

    self.mlflow_util = MLflowLoggerUtil()

    # The advisory below only matters when running through Ray Client.
    if not ray.util.client.ray.is_connected():
        return

    logger.warning(
        "When using MLflowLoggerCallback with Ray Client, "
        "it is recommended to use a remote tracking "
        "server. If you are using a MLflow tracking server "
        "backed by the local filesystem, then it must be "
        "setup on the server side and not on the client "
        "side."
    )
def __init__(
    self,
    tracking_uri: Optional[str] = None,
    registry_uri: Optional[str] = None,
    experiment_id: Optional[str] = None,
    experiment_name: Optional[str] = None,
    tags: Optional[Dict] = None,
    save_artifact: bool = False,
    logdir: Optional[str] = None,
    worker_to_log: int = 0,
):
    """Create the (deprecated) MLflow callback and store its settings.

    Emits a ``DeprecationWarning`` carrying ``_deprecation_msg`` on every
    construction.

    Args:
        tracking_uri: Stored as ``self.tracking_uri``.
        registry_uri: Stored as ``self.registry_uri``.
        experiment_id: Stored as ``self.experiment_id``.
        experiment_name: Stored as ``self.experiment_name``.
        tags: Stored as ``self.tags``.
        save_artifact: Stored as ``self.save_artifact``.
        logdir: Passed to ``_TrainCallbackLogdirManager``.
        worker_to_log: Passed as ``indices`` to
            ``IndexedResultsPreprocessor``.
    """
    # stacklevel=2 attributes the warning to the code instantiating the
    # callback rather than to this line inside the library.
    warnings.warn(
        _deprecation_msg,
        DeprecationWarning,
        stacklevel=2,
    )
    self._logdir_manager = _TrainCallbackLogdirManager(logdir=logdir)
    self.results_preprocessor = IndexedResultsPreprocessor(
        indices=worker_to_log
    )

    self.tracking_uri = tracking_uri
    self.registry_uri = registry_uri
    self.experiment_id = experiment_id
    self.experiment_name = experiment_name
    self.tags = tags

    self.save_artifact = save_artifact
    self.mlflow_util = MLflowLoggerUtil()
def setUp(self):
    """Point MLflow at a fresh temp directory and seed one experiment."""
    scratch_dir = tempfile.mkdtemp()
    self.dirpath = scratch_dir

    import mlflow

    # Use the temporary directory as the tracking store and create the
    # experiment named "existing_experiment" in it.
    mlflow.set_tracking_uri(scratch_dir)
    mlflow.create_experiment(name="existing_experiment")

    self.mlflow_util = MLflowLoggerUtil()
    # Read the URI back from mlflow instead of reusing the raw path.
    self.tracking_uri = mlflow.get_tracking_uri()
def __init__(self, config: Dict, *args, **kwargs):
    """Initialize the trainable and start an MLflow run for this trial.

    Reads the MLflow settings from ``config["mlflow"]``, configures MLflow
    through ``MLflowLoggerUtil.setup_mlflow`` (without creating missing
    experiments) and starts an active run named
    ``<trial_name>_<trial_id>`` with any ``/`` replaced by ``_``.

    Args:
        config: Trial configuration. Must contain an ``mlflow`` key whose
            value is a dict holding at least ``tracking_uri``. The keys
            ``token``, ``experiment_id`` and ``experiment_name`` are read
            when present.
        *args: Forwarded to the next ``__init__`` in the MRO.
        **kwargs: Forwarded to the next ``__init__`` in the MRO.

    Raises:
        ValueError: If ``self`` is not a ``Trainable``, if ``config`` has
            no ``mlflow`` key, or if no ``tracking_uri`` is given.
    """
    self.mlflow_util = MLflowLoggerUtil()

    if not isinstance(self, Trainable):
        raise ValueError(
            "The `MLflowTrainableMixin` can only be used as a mixin "
            "for `tune.Trainable` classes. Please make sure your "
            "class inherits from both. For example: "
            "`class YourTrainable(MLflowTrainableMixin, tune.Trainable)`."
        )

    super().__init__(config, *args, **kwargs)

    # Work on copies so the caller's config dict is not mutated by pop().
    _config = config.copy()
    try:
        mlflow_config = _config.pop("mlflow").copy()
    except KeyError as e:
        raise ValueError(
            "MLflow mixin specified but no configuration has been passed. "
            "Make sure to include a `mlflow` key in your `config` dict "
            "containing at least a `tracking_uri` and either "
            "`experiment_name` or `experiment_id` specification."
        ) from e

    tracking_uri = mlflow_config.pop("tracking_uri", None)
    if tracking_uri is None:
        raise ValueError(
            "MLflow mixin specified but no "
            "tracking_uri has been "
            "passed in. Make sure to include a `mlflow` "
            "key in your `config` dict containing at "
            "least a `tracking_uri`"
        )

    # Set the tracking token if one is passed in.
    tracking_token = mlflow_config.pop("token", None)

    experiment_id = mlflow_config.pop("experiment_id", None)
    experiment_name = mlflow_config.pop("experiment_name", None)

    # This initialization happens in each of the Trainables/workers.
    # So we have to set `create_experiment_if_not_exists` to False.
    # Otherwise there might be race conditions when each worker tries to
    # create the same experiment.
    # For the mixin, the experiment must be created beforehand.
    self.mlflow_util.setup_mlflow(
        tracking_uri=tracking_uri,
        experiment_id=experiment_id,
        experiment_name=experiment_name,
        tracking_token=tracking_token,
        create_experiment_if_not_exists=False,
    )

    run_name = self.trial_name + "_" + self.trial_id
    run_name = run_name.replace("/", "_")
    self.mlflow_util.start_run(set_active=True, run_name=run_name)
def __init__(
    self,
    tracking_uri: Optional[str] = None,
    registry_uri: Optional[str] = None,
    experiment_id: Optional[str] = None,
    experiment_name: Optional[str] = None,
    tags: Optional[Dict] = None,
    save_artifact: bool = False,
    logdir: Optional[str] = None,
    worker_to_log: int = 0,
):
    """Initialize the base callback, then record the MLflow settings.

    Args:
        tracking_uri: Stored as ``self.tracking_uri``.
        registry_uri: Stored as ``self.registry_uri``.
        experiment_id: Stored as ``self.experiment_id``.
        experiment_name: Stored as ``self.experiment_name``.
        tags: Stored as ``self.tags``.
        save_artifact: Stored as ``self.save_artifact``.
        logdir: Forwarded to the parent constructor.
        worker_to_log: Forwarded to the parent constructor.
    """
    # The parent constructor receives the logdir / worker selection.
    super().__init__(logdir=logdir, worker_to_log=worker_to_log)

    self.save_artifact = save_artifact
    self.tags = tags
    self.experiment_name = experiment_name
    self.experiment_id = experiment_id
    self.registry_uri = registry_uri
    self.tracking_uri = tracking_uri

    self.mlflow_util = MLflowLoggerUtil()
class MLflowLoggerCallback(LoggerCallback):
    """Tune ``LoggerCallback`` that mirrors trials into MLflow runs.

    MLflow (https://mlflow.org) Tracking is an open source library for
    recording and querying experiments. For every trial this callback
    starts one MLflow run, logs the trial config as parameters, logs each
    reported result as metrics, optionally uploads the trial logdir as
    artifacts, and ends the run when the trial ends.

    Args:
        tracking_uri (str): The tracking URI for where to manage
            experiments and runs. This can either be a local file path or
            a remote server. This arg gets passed directly to mlflow
            initialization. When using Tune in a multi-node setting, make
            sure to set this to a remote server and not a local file path.
        registry_uri (str): The registry URI that gets passed directly to
            mlflow initialization.
        experiment_name (str): The experiment name to use for this Tune
            run. If the experiment with the name already exists with
            MLflow, it will be reused. If not, a new experiment will be
            created with that name.
        tags (Dict): An optional dictionary of string keys and values to
            set as tags on the run.
        save_artifact (bool): If set to True, automatically save the
            entire contents of the Tune local_dir as an artifact to the
            corresponding run in MLflow.

    Example:

    .. code-block:: python

        from ray.tune.integration.mlflow import MLflowLoggerCallback

        tags = {"user_name": "John", "git_commit_hash": "abc123"}

        tune.run(
            train_fn,
            config={
                # define search space here
                "parameter_1": tune.choice([1, 2, 3]),
                "parameter_2": tune.choice([4, 5, 6]),
            },
            callbacks=[
                MLflowLoggerCallback(
                    experiment_name="experiment1",
                    tags=tags,
                    save_artifact=True,
                )
            ],
        )

    """

    def __init__(
        self,
        tracking_uri: Optional[str] = None,
        registry_uri: Optional[str] = None,
        experiment_name: Optional[str] = None,
        tags: Optional[Dict] = None,
        save_artifact: bool = False,
    ):
        """Store the settings and create the MLflow logging helper."""
        self.should_save_artifact = save_artifact
        self.tags = tags
        self.experiment_name = experiment_name
        self.registry_uri = registry_uri
        self.tracking_uri = tracking_uri

        self.mlflow_util = MLflowLoggerUtil()

        # The advisory below only matters when running through Ray Client.
        if not ray.util.client.ray.is_connected():
            return

        logger.warning(
            "When using MLflowLoggerCallback with Ray Client, "
            "it is recommended to use a remote tracking "
            "server. If you are using a MLflow tracking server "
            "backed by the local filesystem, then it must be "
            "setup on the server side and not on the client "
            "side."
        )

    def setup(self, *args, **kwargs):
        """Configure MLflow and reset the per-trial run bookkeeping."""
        self.mlflow_util.setup_mlflow(
            tracking_uri=self.tracking_uri,
            registry_uri=self.registry_uri,
            experiment_name=self.experiment_name,
        )

        # Later code copies the tags, so make sure a dict is always there.
        if self.tags is None:
            self.tags = {}

        # Maps each trial to the id of its MLflow run.
        self._trial_runs = {}

    def log_trial_start(self, trial: "Trial"):
        """Start a run for ``trial`` if needed and log its config."""
        if trial not in self._trial_runs:
            trial_label = str(trial)
            # Tag the run with the trial name without touching self.tags.
            run_tags = self.tags.copy()
            run_tags.update(trial_name=trial_label)
            new_run = self.mlflow_util.start_run(
                tags=run_tags, run_name=trial_label
            )
            self._trial_runs[trial] = new_run.info.run_id

        self.mlflow_util.log_params(
            run_id=self._trial_runs[trial],
            params_to_log=trial.config,
        )

    def log_trial_result(self, iteration: int, trial: "Trial", result: Dict):
        """Log ``result`` as metrics on the run belonging to ``trial``."""
        # Prefer the total timesteps as the step; fall back to the
        # training iteration when it is missing or falsy.
        step = result.get(TIMESTEPS_TOTAL)
        if not step:
            step = result[TRAINING_ITERATION]

        self.mlflow_util.log_metrics(
            run_id=self._trial_runs[trial],
            metrics_to_log=result,
            step=step,
        )

    def log_trial_end(self, trial: "Trial", failed: bool = False):
        """Optionally upload the trial logdir, then end the trial's run."""
        run_id = self._trial_runs[trial]

        # Upload the trial directory only when save_artifact was requested.
        if self.should_save_artifact:
            self.mlflow_util.save_artifacts(run_id=run_id, dir=trial.logdir)

        # Close the run with a status reflecting how the trial ended.
        final_status = "FAILED" if failed else "FINISHED"
        self.mlflow_util.end_run(run_id=run_id, status=final_status)
class MLflowLoggerCallback(TrainingCallback):
    """MLflow Logger to automatically log Train results and config to MLflow.

    MLflow (https://mlflow.org) Tracking is an open source library for
    recording and querying experiments. This Ray Train callback
    sends information (config parameters, training results & metrics,
    and artifacts) to MLflow for automatic experiment tracking.

    Constructing this callback emits a ``DeprecationWarning``.

    Args:
        tracking_uri (Optional[str]): The tracking URI for where to manage
            experiments and runs. This can either be a local file path or
            a remote server. If None is passed in, an ``mlruns`` directory
            inside the logdir of the trainer will be used as the tracking
            URI. This arg gets passed directly to mlflow initialization.
        registry_uri (Optional[str]): The registry URI that gets passed
            directly to mlflow initialization. If None is passed in, an
            ``mlruns`` directory inside the logdir of the trainer will be
            used as the registry URI.
        experiment_id (Optional[str]): The experiment id of an already
            existing experiment. If not passed in, experiment_name will be
            used.
        experiment_name (Optional[str]): The experiment name to use for
            this Train run. If the experiment with the name already exists
            with MLflow, it will be used. If not, a new experiment will be
            created with this name. At least one of ``experiment_id`` or
            ``experiment_name`` must be passed in.
        tags (Optional[Dict]): An optional dictionary of string keys and
            values to set as tags on the run.
        save_artifact: If set to True, automatically save the entire
            contents of the Train local_dir as an artifact to the
            corresponding run in MLflow.
        logdir (Optional[str]): Path to directory where the results file
            should be. If None, will be set by the Trainer. If no tracking
            uri or registry uri are passed in, the logdir will be used for
            both.
        worker_to_log: Worker index to log. By default, will log the
            worker with index 0.
    """

    def __init__(
        self,
        tracking_uri: Optional[str] = None,
        registry_uri: Optional[str] = None,
        experiment_id: Optional[str] = None,
        experiment_name: Optional[str] = None,
        tags: Optional[Dict] = None,
        save_artifact: bool = False,
        logdir: Optional[str] = None,
        worker_to_log: int = 0,
    ):
        """Warn about deprecation and store the callback settings."""
        # stacklevel=2 attributes the warning to the code instantiating
        # the callback rather than to this line inside the library.
        warnings.warn(
            _deprecation_msg,
            DeprecationWarning,
            stacklevel=2,
        )
        self._logdir_manager = _TrainCallbackLogdirManager(logdir=logdir)
        self.results_preprocessor = IndexedResultsPreprocessor(
            indices=worker_to_log
        )

        self.tracking_uri = tracking_uri
        self.registry_uri = registry_uri
        self.experiment_id = experiment_id
        self.experiment_name = experiment_name
        self.tags = tags

        self.save_artifact = save_artifact
        self.mlflow_util = MLflowLoggerUtil()

    def start_training(self, logdir: str, config: Dict, **info):
        """Set up MLflow, start an active run and log ``config`` as params.

        Args:
            logdir: Default log directory handed to the logdir manager.
            config: Parameters to log on the new run.
            **info: Unused.
        """
        self._logdir_manager.setup_logdir(default_logdir=logdir)

        # Fall back to <logdir>/mlruns when no explicit URI was configured.
        tracking_uri = self.tracking_uri or os.path.join(
            str(self.logdir), "mlruns"
        )
        registry_uri = self.registry_uri or os.path.join(
            str(self.logdir), "mlruns"
        )

        self.mlflow_util.setup_mlflow(
            tracking_uri=tracking_uri,
            registry_uri=registry_uri,
            experiment_id=self.experiment_id,
            experiment_name=self.experiment_name,
            create_experiment_if_not_exists=True,
        )

        self.mlflow_util.start_run(tags=self.tags, set_active=True)
        self.mlflow_util.log_params(params_to_log=config)

    def handle_result(self, results: List[Dict], **info):
        """Log the first entry of ``results`` as metrics.

        The step is taken from that entry's ``TRAINING_ITERATION`` value.
        """
        result = results[0]

        self.mlflow_util.log_metrics(
            metrics_to_log=result, step=result[TRAINING_ITERATION]
        )

    def finish_training(self, error: bool = False, **info):
        """Optionally upload checkpoints, then end the MLflow run.

        Args:
            error: When True the run is ended with status ``"FAILED"``,
                otherwise ``"FINISHED"``.
            **info: Unused.
        """
        checkpoint_dir = self.logdir.joinpath(TRAIN_CHECKPOINT_SUBDIR)
        # Only upload when requested and the checkpoint directory exists.
        if self.save_artifact and checkpoint_dir.exists():
            self.mlflow_util.save_artifacts(dir=str(checkpoint_dir))
        self.mlflow_util.end_run(status="FAILED" if error else "FINISHED")

    @property
    def logdir(self) -> Path:
        """Log directory path as reported by the logdir manager."""
        return self._logdir_manager.logdir_path
class MLflowTest(unittest.TestCase):
    """Tests for ``MLflowLoggerUtil`` against a temporary tracking store."""

    def setUp(self):
        """Point MLflow at a fresh temp directory and seed one experiment."""
        self.dirpath = tempfile.mkdtemp()
        import mlflow

        mlflow.set_tracking_uri(self.dirpath)
        mlflow.create_experiment(name="existing_experiment")

        self.mlflow_util = MLflowLoggerUtil()
        self.tracking_uri = mlflow.get_tracking_uri()

    def tearDown(self):
        """Remove the temporary tracking directory."""
        shutil.rmtree(self.dirpath)

    def _set_env(self, key, value):
        """Set an environment variable for the duration of one test.

        The removal is registered as a cleanup so the variable is dropped
        even when an assertion fails, and cannot leak into later tests.
        """
        os.environ[key] = value
        self.addCleanup(os.environ.pop, key, None)

    def test_experiment_id(self):
        """An explicitly passed experiment id is used as-is."""
        self.mlflow_util.setup_mlflow(
            tracking_uri=self.tracking_uri, experiment_id="0"
        )
        assert self.mlflow_util.experiment_id == "0"

    def test_experiment_id_env_var(self):
        """The experiment id is picked up from MLFLOW_EXPERIMENT_ID."""
        self._set_env("MLFLOW_EXPERIMENT_ID", "0")
        self.mlflow_util.setup_mlflow(tracking_uri=self.tracking_uri)
        assert self.mlflow_util.experiment_id == "0"

    def test_experiment_name(self):
        """An existing experiment name resolves to experiment id "0"."""
        self.mlflow_util.setup_mlflow(
            tracking_uri=self.tracking_uri,
            experiment_name="existing_experiment",
        )
        assert self.mlflow_util.experiment_id == "0"

    def test_experiment_name_env_var(self):
        """The experiment name is picked up from MLFLOW_EXPERIMENT_NAME."""
        self._set_env("MLFLOW_EXPERIMENT_NAME", "existing_experiment")
        self.mlflow_util.setup_mlflow(tracking_uri=self.tracking_uri)
        assert self.mlflow_util.experiment_id == "0"

    def test_id_precedence(self):
        """The id from the environment wins over a passed experiment name."""
        self._set_env("MLFLOW_EXPERIMENT_ID", "0")
        self.mlflow_util.setup_mlflow(
            tracking_uri=self.tracking_uri, experiment_name="new_experiment"
        )
        assert self.mlflow_util.experiment_id == "0"

    def test_new_experiment(self):
        """An unknown experiment name results in experiment id "1"."""
        self.mlflow_util.setup_mlflow(
            tracking_uri=self.tracking_uri, experiment_name="new_experiment"
        )
        assert self.mlflow_util.experiment_id == "1"

    def test_setup_fail(self):
        """Setup raises when the experiment is missing and may not be created."""
        with self.assertRaises(ValueError):
            self.mlflow_util.setup_mlflow(
                tracking_uri=self.tracking_uri,
                experiment_name="new_experiment2",
                create_experiment_if_not_exists=False,
            )

    def test_log_params(self):
        """Params logged with an explicit run id accumulate on that run."""
        params = {"a": "a"}
        self.mlflow_util.setup_mlflow(
            tracking_uri=self.tracking_uri, experiment_name="new_experiment"
        )
        run = self.mlflow_util.start_run()
        run_id = run.info.run_id
        self.mlflow_util.log_params(params_to_log=params, run_id=run_id)

        run = self.mlflow_util._mlflow.get_run(run_id=run_id)
        assert run.data.params == params

        # Start a second, active run; logging with the explicit run_id must
        # still land on the first run.
        params2 = {"b": "b"}
        self.mlflow_util.start_run(set_active=True)
        self.mlflow_util.log_params(params_to_log=params2, run_id=run_id)
        assert self.mlflow_util._mlflow.get_run(run_id=run_id).data.params == {
            **params,
            **params2,
        }

        self.mlflow_util.end_run()

    def test_log_metrics(self):
        """Metrics logged with an explicit run id accumulate on that run."""
        metrics = {"a": 1.0}
        self.mlflow_util.setup_mlflow(
            tracking_uri=self.tracking_uri, experiment_name="new_experiment"
        )
        run = self.mlflow_util.start_run()
        run_id = run.info.run_id
        self.mlflow_util.log_metrics(
            metrics_to_log=metrics, run_id=run_id, step=0
        )

        run = self.mlflow_util._mlflow.get_run(run_id=run_id)
        assert run.data.metrics == metrics

        # Start a second, active run; logging with the explicit run_id must
        # still land on the first run.
        metrics2 = {"b": 1.0}
        self.mlflow_util.start_run(set_active=True)
        self.mlflow_util.log_metrics(
            metrics_to_log=metrics2, run_id=run_id, step=0
        )
        assert self.mlflow_util._mlflow.get_run(
            run_id=run_id
        ).data.metrics == {**metrics, **metrics2}
        self.mlflow_util.end_run()