Example #1
def get_databricks_runtime():
    if is_in_databricks_runtime():
        spark_session = _get_active_spark_session()
        if spark_session is not None:
            return spark_session.conf.get(
                "spark.databricks.clusterUsageTags.sparkVersion", default=None)
    return None
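Every example in this section relies on the _get_active_spark_session() helper from mlflow.utils._spark_utils (imported explicitly in Example #2). A minimal sketch of what that helper is assumed to do is shown below; the real implementation varies across MLflow versions, so treat this as illustrative rather than authoritative.

def _get_active_spark_session():
    try:
        from pyspark.sql import SparkSession
    except ImportError:
        # PySpark is not installed, so there cannot be an active session.
        return None
    try:
        # PySpark 3.x exposes the active session directly.
        return SparkSession.getActiveSession()
    except Exception:
        # Older PySpark releases lack getActiveSession(); fall back to the
        # private singleton attribute.
        return SparkSession._instantiatedSession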
Example #2
def _read_log_model_allowlist():
    """
    Reads the log model allowlist and returns it as a set.
    """
    from mlflow.utils._spark_utils import _get_active_spark_session

    builtin_allowlist_file = resource_filename(__name__,
                                               "log_model_allowlist.txt")
    spark_session = _get_active_spark_session()
    if not spark_session:
        _logger.info(
            "No SparkSession detected. Autologging will log pyspark.ml models contained "
            "in the default allowlist. To specify a custom allowlist, initialize a SparkSession "
            "prior to calling mlflow.pyspark.ml.autolog() and specify the path to your allowlist "
            "file via the spark.mlflow.pysparkml.autolog.logModelAllowlistFile conf."
        )
        return _read_log_model_allowlist_from_file(builtin_allowlist_file)

    allowlist_file = spark_session.sparkContext._conf.get(
        "spark.mlflow.pysparkml.autolog.logModelAllowlistFile", None)
    if allowlist_file:
        try:
            return _read_log_model_allowlist_from_file(allowlist_file)
        except Exception:
            # Fall back to the built-in allowlist file.
            _logger.exception(
                ("Reading from custom log_models allowlist file %s failed, "
                 "falling back to the built-in allowlist file."),
                allowlist_file,
            )
            return _read_log_model_allowlist_from_file(builtin_allowlist_file)
    else:
        return _read_log_model_allowlist_from_file(builtin_allowlist_file)
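A caller can point pyspark.ml autologging at a custom allowlist by setting the spark.mlflow.pysparkml.autolog.logModelAllowlistFile conf before enabling autologging, exactly as the log message above suggests. The snippet below is a hedged usage sketch: /tmp/my_allowlist.txt is a hypothetical path, and its contents are assumed to follow the same format as the built-in log_model_allowlist.txt.

from pyspark.sql import SparkSession
import mlflow.pyspark.ml

# Hypothetical custom allowlist; the path must be readable from the driver.
spark = (SparkSession.builder
         .config("spark.mlflow.pysparkml.autolog.logModelAllowlistFile",
                 "/tmp/my_allowlist.txt")
         .getOrCreate())

# _read_log_model_allowlist() will now try the custom file first and fall back
# to the built-in allowlist if reading it fails.
mlflow.pyspark.ml.autolog()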
Example #3
def is_in_cluster():
    try:
        spark_session = _get_active_spark_session()
        return (spark_session is not None and spark_session.conf.get(
            "spark.databricks.clusterUsageTags.clusterId") is not None)
    except Exception:
        return False
Example #4
def is_in_cluster():
    try:
        spark_session = _get_active_spark_session()
        return spark_session is not None \
            and spark_session.conf.get("spark.databricks.clusterUsageTags.clusterId") is not None
    except Exception:  # pylint: disable=broad-except
        return False
Example #5
def get_workspace_url():
    try:
        spark_session = _get_active_spark_session()
        if spark_session is not None:
            return spark_session.conf.get("spark.databricks.workspaceUrl")
    except Exception:
        return None
Example #6
def get_nfs_cache_root_dir():
    if is_in_databricks_runtime():
        nfs_enabled = (_get_active_spark_session().conf.get(
            "spark.databricks.mlflow.nfs.enabled", "true").lower() == "true")
        if nfs_enabled:
            nfs_root_dir = "/local_disk0/.ephemeral_nfs"
            # Test whether the NFS directory is writable.
            test_path = os.path.join(nfs_root_dir, uuid.uuid4().hex)
            try:
                os.makedirs(test_path)
                return nfs_root_dir
            except Exception:
                # On Databricks clusters with Table ACLs enabled, we have no permission to
                # access the NFS directory; in that case, return None to indicate that NFS
                # is unavailable.
                return None
            finally:
                shutil.rmtree(test_path, ignore_errors=True)
        else:
            return None
    else:
        return _get_active_spark_session().conf.get("spark.mlflow.nfs.rootDir",
                                                    None)
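Outside the Databricks runtime, this variant reads the NFS cache root from the spark.mlflow.nfs.rootDir conf and returns None when the conf is unset. A hedged configuration sketch follows; /mnt/nfs/mlflow_cache is a hypothetical mount point.

from pyspark.sql import SparkSession

# Hypothetical shared mount; any directory writable from every node would do.
spark = (SparkSession.builder
         .config("spark.mlflow.nfs.rootDir", "/mnt/nfs/mlflow_cache")
         .getOrCreate())

# get_nfs_cache_root_dir() now returns "/mnt/nfs/mlflow_cache" when not running
# inside the Databricks runtime.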
Example #7
def autolog():
    """Implementation of Spark datasource autologging"""
    global _spark_table_info_listener
    if _get_current_listener() is None:
        active_session = _get_active_spark_session()
        if active_session is None:
            raise MlflowException(
                "No active SparkContext found, refusing to enable Spark datasource "
                "autologging. Please create a SparkSession e.g. via "
                "SparkSession.builder.getOrCreate() (see API docs at "
                "https://spark.apache.org/docs/latest/api/python/"
                "pyspark.sql.html#pyspark.sql.SparkSession) "
                "before attempting to enable autologging"
            )
        # We know SparkContext exists here already, so get it
        sc = SparkContext.getOrCreate()
        if _get_spark_major_version(sc) < 3:
            raise MlflowException("Spark autologging unsupported for Spark versions < 3")
        gw = active_session.sparkContext._gateway
        params = gw.callback_server_parameters
        callback_server_params = CallbackServerParameters(
            address=params.address,
            port=params.port,
            daemonize=True,
            daemonize_connections=True,
            eager_load=params.eager_load,
            ssl_context=params.ssl_context,
            accept_timeout=params.accept_timeout,
            read_timeout=params.read_timeout,
            auth_token=params.auth_token,
        )
        gw.start_callback_server(callback_server_params)

        event_publisher = _get_jvm_event_publisher()
        try:
            event_publisher.init(1)
            _spark_table_info_listener = PythonSubscriber()
            _spark_table_info_listener.register()
        except Exception as e:
            raise MlflowException(
                "Exception while attempting to initialize JVM-side state for "
                "Spark datasource autologging. Please ensure you have the "
                "mlflow-spark JAR attached to your Spark session as described "
                "in http://mlflow.org/docs/latest/tracking.html#"
                "automatic-logging-from-spark-experimental. Exception:\n%s" % e
            )

        # Register context provider for Spark autologging
        from mlflow.tracking.context.registry import _run_context_provider_registry

        _run_context_provider_registry.register(SparkAutologgingContext)
Example #8
def autolog():
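    # Patch SparkSession.__init__ so that any session created after autolog() is
    # called also gets a datasource event listener attached.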
    def __init__(self, *args, **kwargs):
        original = gorilla.get_original_attribute(SparkSession, "__init__")
        original(self, *args, **kwargs)

        _listen_for_spark_activity(self._sc)

    wrap_patch(SparkSession, "__init__", __init__)

    active_session = _get_active_spark_session()
    if active_session is not None:
        # We know SparkContext exists here already, so get it
        sc = SparkContext.getOrCreate()

        _listen_for_spark_activity(sc)
Example #9
def get_cluster_id():
    spark_session = _get_active_spark_session()
    if spark_session is None:
        return None
    return spark_session.conf.get("spark.databricks.clusterUsageTags.clusterId")
Example #10
def autolog(disable=False, silent=False):  # pylint: disable=unused-argument
    """
    Enables (or disables) and configures logging of Spark datasource paths, versions
    (if applicable), and formats when they are read. This method is not threadsafe and assumes a
    `SparkSession
    <https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.SparkSession>`_
    already exists with the
    `mlflow-spark JAR
    <http://mlflow.org/docs/latest/tracking.html#automatic-logging-from-spark-experimental>`_
    attached. It should be called on the Spark driver, not on the executors (i.e. do not call
    this method within a function parallelized by Spark). This API requires Spark 3.0 or above.

    Datasource information is cached in memory and logged to all subsequent MLflow runs,
    including the active MLflow run (if one exists when the data is read). Note that autologging of
    Spark ML (MLlib) models is not currently supported via this API. Datasource autologging is
    best-effort, meaning that if Spark is under heavy load or MLflow logging fails for any reason
    (e.g., if the MLflow server is unavailable), logging may be dropped.

    For any unexpected issues with autologging, check Spark driver and executor logs in addition
    to stderr & stdout generated from your MLflow code - datasource information is pulled from
    Spark, so logs relevant to debugging may show up amongst the Spark logs.

    .. code-block:: python
        :caption: Example

        import mlflow.spark
        import os
        import shutil
        from pyspark.sql import SparkSession
        # Create and persist some dummy data
        # Note: On environments like Databricks with pre-created SparkSessions,
        # ensure the org.mlflow:mlflow-spark:1.11.0 is attached as a library to
        # your cluster
        spark = (SparkSession.builder
                    .config("spark.jars.packages", "org.mlflow:mlflow-spark:1.11.0")
                    .master("local[*]")
                    .getOrCreate())
        df = spark.createDataFrame([
                (4, "spark i j k"),
                (5, "l m n"),
                (6, "spark hadoop spark"),
                (7, "apache hadoop")], ["id", "text"])
        import tempfile
        tempdir = tempfile.mkdtemp()
        df.write.csv(os.path.join(tempdir, "my-data-path"), header=True)
        # Enable Spark datasource autologging.
        mlflow.spark.autolog()
        loaded_df = spark.read.csv(os.path.join(tempdir, "my-data-path"),
                        header=True, inferSchema=True)
        # Call toPandas() to trigger a read of the Spark datasource. Datasource info
        # (path and format) is logged to the current active run, or the
        # next-created MLflow run if no run is currently active
        with mlflow.start_run() as active_run:
            pandas_df = loaded_df.toPandas()

    :param disable: If ``True``, disables the Spark datasource autologging integration.
                    If ``False``, enables the Spark datasource autologging integration.
    :param silent: If ``True``, suppress all event logs and warnings from MLflow during Spark
                   datasource autologging. If ``False``, show all events and warnings during Spark
                   datasource autologging.
    """
    from mlflow.utils._spark_utils import _get_active_spark_session
    from mlflow._spark_autologging import _listen_for_spark_activity
    from pyspark.sql import SparkSession
    from pyspark import SparkContext

    def __init__(original, self, *args, **kwargs):
        original(self, *args, **kwargs)

        _listen_for_spark_activity(self._sc)

    safe_patch(FLAVOR_NAME,
               SparkSession,
               "__init__",
               __init__,
               manage_run=False)

    active_session = _get_active_spark_session()
    if active_session is not None:
        # We know SparkContext exists here already, so get it
        sc = SparkContext.getOrCreate()

        _listen_for_spark_activity(sc)
Example #11
def get_nfs_cache_root_dir():
    # TODO: create isolated path for each user
    if is_in_databricks_runtime():
        return "/local_disk0/.ephemeral_nfs/mlflow/cache"
    else:
        return _get_active_spark_session().conf.get("spark.mlflow.nfs.rootDir", None)