Exemplo n.º 1
0
class MLflowLoggerCallback(LoggerCallback):
    """Ray Tune ``LoggerCallback`` that mirrors trials into MLflow runs.

    MLflow (https://mlflow.org) Tracking is an open source library for
    recording and querying experiments. For every Tune trial this callback
    opens one MLflow run (named after the trial and tagged with
    ``trial_name``), logs the trial config as parameters, logs each reported
    result as metrics, and finally closes the run as ``FINISHED`` or
    ``FAILED``.

    Args:
        tracking_uri (str): The tracking URI for where to manage experiments
            and runs. This can either be a local file path or a remote
            server. It is handed to the MLflow setup unchanged. When using
            Tune in a multi-node setting, make sure to set this to a remote
            server and not a local file path.
        registry_uri (str): The registry URI, handed to the MLflow setup
            unchanged.
        experiment_name (str): The experiment name to use for this Tune run.
            If an experiment with that name already exists in MLflow it is
            reused; otherwise a new experiment is created with that name.
        tags (Dict): An optional dictionary of string keys and values to set
            as tags on every run.
        save_artifact (bool): If True, upload the contents of the trial's
            log directory (``trial.logdir``) as artifacts of the
            corresponding MLflow run when the trial ends.

    Example:

    .. code-block:: python

        from ray.tune.integration.mlflow import MLflowLoggerCallback

        tags = { "user_name" : "John",
                 "git_commit_hash" : "abc123"}

        tune.run(
            train_fn,
            config={
                # define search space here
                "parameter_1": tune.choice([1, 2, 3]),
                "parameter_2": tune.choice([4, 5, 6]),
            },
            callbacks=[MLflowLoggerCallback(
                experiment_name="experiment1",
                tags=tags,
                save_artifact=True)])

    """

    def __init__(
        self,
        tracking_uri: Optional[str] = None,
        registry_uri: Optional[str] = None,
        experiment_name: Optional[str] = None,
        tags: Optional[Dict] = None,
        save_artifact: bool = False,
    ):
        """Store the MLflow settings; the connection itself is made in
        :meth:`setup`."""
        # Where and under which experiment the runs are recorded.
        self.tracking_uri = tracking_uri
        self.registry_uri = registry_uri
        self.experiment_name = experiment_name

        # Per-run behaviour.
        self.tags = tags
        self.should_save_artifact = save_artifact

        self.mlflow_util = MLflowLoggerUtil()

        if ray.util.client.ray.is_connected():
            # With Ray Client a local-filesystem tracking store lives on the
            # server, which is easy to get wrong, so point it out up front.
            client_notice = (
                "When using MLflowLoggerCallback with Ray Client, "
                "it is recommended to use a remote tracking "
                "server. If you are using a MLflow tracking server "
                "backed by the local filesystem, then it must be "
                "setup on the server side and not on the client "
                "side.")
            logger.warning(client_notice)

    def setup(self, *args, **kwargs):
        """Configure MLflow and reset the trial -> run id bookkeeping."""
        self.mlflow_util.setup_mlflow(
            tracking_uri=self.tracking_uri,
            registry_uri=self.registry_uri,
            experiment_name=self.experiment_name,
        )

        # Fall back to an empty tag set when the caller supplied none.
        self.tags = {} if self.tags is None else self.tags

        # Maps each trial to the id of the MLflow run opened for it.
        self._trial_runs = {}

    def log_trial_start(self, trial: "Trial"):
        """Open an MLflow run for ``trial`` if needed and log its config."""
        if trial in self._trial_runs:
            # A run was already opened for this trial; reuse it.
            run_id = self._trial_runs[trial]
        else:
            trial_label = str(trial)

            # Tag the run with the trial name without touching shared tags.
            run_tags = self.tags.copy()
            run_tags["trial_name"] = trial_label

            new_run = self.mlflow_util.start_run(
                tags=run_tags, run_name=trial_label)
            run_id = new_run.info.run_id
            self._trial_runs[trial] = run_id

        # Record the trial's config as MLflow parameters.
        self.mlflow_util.log_params(run_id=run_id, params_to_log=trial.config)

    def log_trial_result(self, iteration: int, trial: "Trial", result: Dict):
        """Log ``result`` as metrics on the run that belongs to ``trial``."""
        # Use total timesteps as the step when present and non-zero,
        # otherwise the training iteration.
        timesteps = result.get(TIMESTEPS_TOTAL)
        step = timesteps if timesteps else result[TRAINING_ITERATION]

        self.mlflow_util.log_metrics(
            run_id=self._trial_runs[trial],
            metrics_to_log=result,
            step=step,
        )

    def log_trial_end(self, trial: "Trial", failed: bool = False):
        """Optionally upload the trial logdir, then close the trial's run."""
        run_id = self._trial_runs[trial]

        # Upload the trial directory only when save_artifact was requested.
        if self.should_save_artifact:
            self.mlflow_util.save_artifacts(run_id=run_id, dir=trial.logdir)

        # Close the run with a status that reflects how the trial ended.
        final_status = "FAILED" if failed else "FINISHED"
        self.mlflow_util.end_run(run_id=run_id, status=final_status)
Exemplo n.º 2
0
class MLflowLoggerCallback(TrainingCallback):
    """MLflow Logger to automatically log Train results and config to MLflow.

    MLflow (https://mlflow.org) Tracking is an open source library for
    recording and querying experiments. This Ray Train callback
    sends information (config parameters, training results & metrics,
    and artifacts) to MLflow for automatic experiment tracking.

    Instantiating this class emits a ``DeprecationWarning``.

    Args:
        tracking_uri (Optional[str]): The tracking URI for where to manage
            experiments and runs. This can either be a local file path or a
            remote server. If not provided, the ``mlruns`` subdirectory of
            the callback's logdir is used as the tracking URI.
            This arg gets passed directly to mlflow initialization.
        registry_uri (Optional[str]): The registry URI that gets passed
            directly to mlflow initialization. If not provided, the
            ``mlruns`` subdirectory of the callback's logdir is used as the
            registry URI.
        experiment_id (Optional[str]): The experiment id of an already
            existing experiment. If not
            passed in, experiment_name will be used.
        experiment_name (Optional[str]): The experiment name to use for this
            Train run.
            If the experiment with the name already exists with MLflow,
            it will be used. If not, a new experiment will be created with
            this name. At least one of ``experiment_id`` or
            ``experiment_name`` must be passed in (this class itself does
            not validate that).
        tags (Optional[Dict]):  An optional dictionary of string keys and
            values to set as tags on the run
        save_artifact: If set to True, save the contents of the checkpoint
            subdirectory of the logdir (when it exists) as an artifact of
            the MLflow run once training finishes.
        logdir (Optional[str]): Path to directory where the results file
            should be. If None, will be set by the Trainer. If no tracking
            uri or registry uri are passed in, ``<logdir>/mlruns`` will be
            used for both.
        worker_to_log: Worker index to log. By default, will log the
            worker with index 0.
    """
    def __init__(
        self,
        tracking_uri: Optional[str] = None,
        registry_uri: Optional[str] = None,
        experiment_id: Optional[str] = None,
        experiment_name: Optional[str] = None,
        tags: Optional[Dict] = None,
        save_artifact: bool = False,
        logdir: Optional[str] = None,
        worker_to_log: int = 0,
    ):
        # stacklevel=2 attributes the warning to the code that instantiates
        # the callback rather than to this module. Python's default filters
        # only display DeprecationWarning when it is attributed to
        # ``__main__``, so without it users would typically never see it.
        warnings.warn(
            _deprecation_msg,
            DeprecationWarning,
            stacklevel=2,
        )
        self._logdir_manager = _TrainCallbackLogdirManager(logdir=logdir)
        # Restricts the reported results to the selected worker index.
        self.results_preprocessor = IndexedResultsPreprocessor(
            indices=worker_to_log)

        self.tracking_uri = tracking_uri
        self.registry_uri = registry_uri
        self.experiment_id = experiment_id
        self.experiment_name = experiment_name
        self.tags = tags

        self.save_artifact = save_artifact
        self.mlflow_util = MLflowLoggerUtil()

    def start_training(self, logdir: str, config: Dict, **info):
        """Set up MLflow, start an active run and log ``config`` as params.

        Args:
            logdir: Default log directory, used by the logdir manager when
                none was given to the constructor.
            config: Training configuration, logged as MLflow parameters.
            **info: Additional callback information (unused here).
        """
        self._logdir_manager.setup_logdir(default_logdir=logdir)

        # Without explicit URIs, keep all MLflow data under <logdir>/mlruns.
        tracking_uri = self.tracking_uri or os.path.join(
            str(self.logdir), "mlruns")
        registry_uri = self.registry_uri or os.path.join(
            str(self.logdir), "mlruns")

        self.mlflow_util.setup_mlflow(
            tracking_uri=tracking_uri,
            registry_uri=registry_uri,
            experiment_id=self.experiment_id,
            experiment_name=self.experiment_name,
            create_experiment_if_not_exists=True,
        )

        self.mlflow_util.start_run(tags=self.tags, set_active=True)
        self.mlflow_util.log_params(params_to_log=config)

    def handle_result(self, results: List[Dict], **info):
        """Log the first entry of ``results`` as metrics for this iteration.

        Args:
            results: Reported results; only ``results[0]`` is logged.
                NOTE(review): presumably already narrowed to
                ``worker_to_log`` by ``results_preprocessor`` -- confirm in
                ``TrainingCallback``.
            **info: Additional callback information (unused here).
        """
        result = results[0]

        self.mlflow_util.log_metrics(metrics_to_log=result,
                                     step=result[TRAINING_ITERATION])

    def finish_training(self, error: bool = False, **info):
        """Optionally upload checkpoints, then end the MLflow run.

        Args:
            error: Whether training ended with an error; selects the
                ``FAILED`` run status instead of ``FINISHED``.
            **info: Additional callback information (unused here).
        """
        checkpoint_dir = self.logdir.joinpath(TRAIN_CHECKPOINT_SUBDIR)
        # Only upload when requested and when checkpoints were written.
        if self.save_artifact and checkpoint_dir.exists():
            self.mlflow_util.save_artifacts(dir=str(checkpoint_dir))
        self.mlflow_util.end_run(status="FAILED" if error else "FINISHED")

    @property
    def logdir(self) -> Path:
        """Log directory path as reported by the logdir manager."""
        return self._logdir_manager.logdir_path