def _get_run_link(self, tracking_uri, run_id):
    """Best-effort construction of a Databricks run URL for ``run_id``.

    Returns the URL string when the workspace host, experiment ID and run ID
    can all be determined; otherwise falls through and returns None.
    """
    # On the default Databricks tracking URI inside a notebook or job, the
    # workspace host/ID can be discovered automatically via DBUtils.
    in_databricks_runtime = is_in_databricks_notebook() or is_in_databricks_job()
    if is_databricks_default_tracking_uri(tracking_uri) and in_databricks_runtime:
        workspace_host, workspace_id = get_workspace_info_from_dbutils()
    else:
        # Otherwise the caller must have configured a databricks://scope:prefix
        # profile, with the host and workspace ID stored in the Databricks
        # Secret Manager under scope=<scope> and key=<prefix>-workspaceid.
        workspace_host, workspace_id = get_workspace_info_from_databricks_secrets(tracking_uri)
        if not workspace_id:
            print(
                "No workspace ID specified; if your Databricks workspaces share the same"
                " host URL, you may want to specify the workspace ID (along with the host"
                " information in the secret manager) for run lineage tracking. For more"
                " details on how to specify this information in the secret manager,"
                " please refer to the model registry documentation.")
    # The run URL additionally needs the experiment that owns the run.
    experiment_id = self.get_run(run_id).info.experiment_id
    if workspace_host and run_id and experiment_id:
        return construct_run_url(workspace_host, experiment_id, run_id, workspace_id)
def _get_experiment_id():
    """Resolve the experiment ID, falling back to the deprecated default ``"0"``.

    Precedence: explicitly activated experiment, then the environment variable,
    then the notebook ID when running inside a Databricks notebook.
    """
    # TODO: Replace with None for 1.0, leaving for 0.9.1 release backcompat with existing servers
    legacy_default_exp_id = "0"
    resolved = (
        _active_experiment_id
        or _get_experiment_id_from_env()
        or (is_in_databricks_notebook() and get_notebook_id())
    )
    return resolved or legacy_default_exp_id
def test_no_throw():
    """
    Outside of Databricks the databricks_utils methods should never throw and
    should only return None.
    """
    # Each probe must evaluate falsy (None/False) when not on Databricks.
    probes = (
        databricks_utils.is_in_databricks_notebook,
        databricks_utils.is_in_databricks_job,
        databricks_utils.is_dbfs_fuse_available,
    )
    for probe in probes:
        assert not probe()
def create_model_version(self, name, source, run_id, tags=None, run_link=None):
    """
    Create a new model version from given source or run ID.

    :param name: Name ID for containing registered model.
    :param source: Source path where the MLflow model is stored.
    :param run_id: Run ID from MLflow tracking server that generated the model
    :param tags: A dictionary of key-value pairs that are converted into
                 :py:class:`mlflow.entities.model_registry.ModelVersionTag` objects.
    :param run_link: Link to the run from an MLflow tracking server that generated
                     this model.
    :return: Single :py:class:`mlflow.entities.model_registry.ModelVersion` object
             created by backend.
    """
    tracking_uri = self._tracking_client.tracking_uri
    # Auto-populate the run link only for Databricks tracking backends, when the
    # registry lives on a different server and the caller did not supply a link.
    should_fill_run_link = (
        is_databricks_uri(tracking_uri)
        and tracking_uri != self._registry_uri
        and not run_link
    )
    if should_fill_run_link:
        if is_databricks_default_tracking_uri(tracking_uri) and is_in_databricks_notebook():
            # Inside a notebook on the default tracking URI, DBUtils can report
            # the workspace host and workspace ID directly.
            workspace_host, workspace_id = get_workspace_info_from_dbutils()
        else:
            # Otherwise the workspace info must come from a databricks://scope/prefix
            # profile whose host and workspace ID are stored in the Databricks Secret
            # Manager with scope=<scope> and key=<prefix>-workspaceid.
            workspace_host, workspace_id = get_workspace_info_from_databricks_secrets(
                tracking_uri)
            if not workspace_id:
                print(
                    "No workspace ID specified; if your Databricks workspaces share the same"
                    " host URL, you may want to specify the workspace ID (along with the host"
                    " information in the secret manager) for run lineage tracking. For more"
                    " details on how to specify this information in the secret manager,"
                    " please refer to the model registry documentation.")
        # The run URL additionally needs the experiment that owns the run.
        experiment_id = self.get_run(run_id).info.experiment_id
        if workspace_host and run_id and experiment_id:
            run_link = construct_run_url(workspace_host, experiment_id, run_id, workspace_id)
    return self._get_registry_client().create_model_version(
        name=name, source=source, run_id=run_id, tags=tags, run_link=run_link)
def request_headers(self):
    """Collect Databricks context headers describing where this request originates.

    Only the identifiers that apply to the current environment (notebook, job,
    cluster) are included.
    """
    headers = {}
    if databricks_utils.is_in_databricks_notebook():
        headers["notebook_id"] = databricks_utils.get_notebook_id()
    if databricks_utils.is_in_databricks_job():
        headers["job_id"] = databricks_utils.get_job_id()
        headers["job_run_id"] = databricks_utils.get_job_run_id()
        headers["job_type"] = databricks_utils.get_job_type()
    if databricks_utils.is_in_cluster():
        headers["cluster_id"] = databricks_utils.get_cluster_id()
    return headers
def is_flavor_supported_for_associated_package_versions(flavor_name):
    """
    :return: True if the specified flavor is supported for the currently-installed
             versions of its associated packages
    """
    module_name, version_info_key = FLAVOR_TO_MODULE_NAME_AND_VERSION_INFO_KEY[flavor_name]
    installed_version = importlib.import_module(module_name).__version__

    # In Databricks, treat 'pyspark 3.x.y.dev0' as 'pyspark 3.x.y'
    running_in_databricks = is_in_databricks_notebook() or is_in_databricks_job()
    if module_name == "pyspark" and running_in_databricks:
        installed_version = _strip_dev_version_suffix(installed_version)

    # Malformed or pre-release versions are never considered supported.
    if _violates_pep_440(installed_version) or _is_pre_or_dev_release(installed_version):
        return False

    min_version, max_version, _ = get_min_max_version_and_pip_release(version_info_key)
    return _check_version_in_range(installed_version, min_version, max_version)
def start_run(run_uuid=None, experiment_id=None, source_name=None, source_version=None,
              entry_point_name=None, source_type=None, run_name=None):
    """
    Start a new MLflow run, setting it as the active run under which metrics and parameters
    will be logged. The return value can be used as a context manager within a ``with`` block;
    otherwise, you must call ``end_run()`` to terminate the current run. If you pass a
    ``run_uuid`` or the ``MLFLOW_RUN_ID`` environment variable is set, ``start_run`` attempts
    to resume a run with the specified run ID and other parameters are ignored. ``run_uuid``
    takes precedence over ``MLFLOW_RUN_ID``.

    :param run_uuid: If specified, get the run with the specified UUID and log parameters
                     and metrics under that run. The run's end time is unset and its status is
                     set to running, but the run's other attributes (``source_version``,
                     ``source_type``, etc.) are not changed.
    :param experiment_id: ID of the experiment under which to create the current run
                          (applicable only when ``run_uuid`` is not specified). If
                          ``experiment_id`` argument is unspecified, will look for valid
                          experiment in the following order: activated using
                          ``set_experiment``, ``MLFLOW_EXPERIMENT_ID`` env variable, or the
                          default experiment.
    :param source_name: Name of the source file or URI of the project to be associated with
                        the run. If none provided defaults to the current file.
    :param source_version: Optional Git commit hash to associate with the run.
    :param entry_point_name: Optional name of the entry point for the current run.
    :param source_type: Integer :py:class:`mlflow.entities.SourceType` describing the type of
                        the run ("local", "project", etc.). Defaults to
                        :py:class:`mlflow.entities.SourceType.LOCAL` ("local").
    :param run_name: Name of new run. Used only when ``run_uuid`` is unspecified.
    :return: :py:class:`mlflow.ActiveRun` object that acts as a context manager wrapping the
             run's state.
    """
    global _active_run
    # Nested runs are not supported: refuse to start while another run is active.
    if _active_run:
        raise Exception("Run with UUID %s is already active, unable to start nested "
                        "run" % _active_run.info.run_uuid)
    # Explicit run_uuid argument wins over the MLFLOW_RUN_ID environment variable.
    existing_run_uuid = run_uuid or os.environ.get(_RUN_ID_ENV_VAR, None)
    if existing_run_uuid:
        # Resume path: fetch the existing run rather than creating a new one.
        _validate_run_id(existing_run_uuid)
        active_run_obj = MlflowClient().get_run(existing_run_uuid)
    else:
        exp_id_for_run = experiment_id or _get_experiment_id()
        if is_in_databricks_notebook():
            # Inside a Databricks notebook, tag the run with notebook metadata
            # (only the pieces that could actually be resolved).
            databricks_tags = {}
            notebook_id = get_notebook_id()
            notebook_path = get_notebook_path()
            webapp_url = get_webapp_url()
            if notebook_id is not None:
                databricks_tags[MLFLOW_DATABRICKS_NOTEBOOK_ID] = notebook_id
            if notebook_path is not None:
                databricks_tags[MLFLOW_DATABRICKS_NOTEBOOK_PATH] = notebook_path
            if webapp_url is not None:
                databricks_tags[MLFLOW_DATABRICKS_WEBAPP_URL] = webapp_url
            # Note: source_name is forced to the notebook path and source_type
            # to NOTEBOOK, overriding the caller-supplied values on this path.
            active_run_obj = MlflowClient().create_run(
                experiment_id=exp_id_for_run,
                run_name=run_name,
                source_name=notebook_path,
                source_version=source_version or _get_source_version(),
                entry_point_name=entry_point_name,
                source_type=SourceType.NOTEBOOK,
                tags=databricks_tags)
        else:
            # Outside Databricks: fall back to caller-supplied source info,
            # auto-detecting anything left unspecified.
            active_run_obj = MlflowClient().create_run(
                experiment_id=exp_id_for_run,
                run_name=run_name,
                source_name=source_name or _get_source_name(),
                source_version=source_version or _get_source_version(),
                entry_point_name=entry_point_name,
                source_type=source_type or _get_source_type())
    # Publish the new/resumed run as the module-level active run.
    _active_run = ActiveRun(active_run_obj)
    return _active_run
def in_context(self):
    """Whether the current process is running inside Databricks — on a cluster,
    in a notebook, or in a job."""
    # any() short-circuits just like the original `or` chain.
    return any(probe() for probe in (
        databricks_utils.is_in_cluster,
        databricks_utils.is_in_databricks_notebook,
        databricks_utils.is_in_databricks_job,
    ))
def in_context(self):
    """Whether execution is currently happening inside a Databricks notebook."""
    inside_notebook = databricks_utils.is_in_databricks_notebook()
    return inside_notebook
def _get_experiment_id():
    """Resolve the current experiment ID as an int.

    Precedence: explicitly activated experiment, the experiment-ID environment
    variable, the current notebook ID (only when auto-detection is enabled via
    its env var and we are inside a Databricks notebook), then the default.
    """
    candidate = _active_experiment_id
    if not candidate:
        candidate = env.get_env(_EXPERIMENT_ID_ENV_VAR)
    if not candidate:
        # Notebook autodetection is opt-in via the autodetect env var.
        candidate = (env.get_env(_AUTODETECT_EXPERIMENT)
                     and is_in_databricks_notebook()
                     and get_notebook_id())
    return int(candidate or Experiment.DEFAULT_EXPERIMENT_ID)
def _get_experiment_id():
    """Resolve the current experiment ID as an int, preferring the explicitly
    activated experiment, then the environment variable, then the notebook ID
    when inside a Databricks notebook, then the default experiment."""
    resolved = (_active_experiment_id
                or _get_experiment_id_from_env()
                or (is_in_databricks_notebook() and get_notebook_id())
                or Experiment.DEFAULT_EXPERIMENT_ID)
    return int(resolved)