def test_uri_types():
    assert is_local_uri("mlruns")
    assert is_local_uri("./mlruns")
    assert is_local_uri("file:///foo/mlruns")
    assert is_local_uri("file:foo/mlruns")
    assert not is_local_uri("https://whatever")
    assert not is_local_uri("http://whatever")
    assert not is_local_uri("databricks")
    assert not is_local_uri("databricks:whatever")
    assert not is_local_uri("databricks://whatever")

    assert is_databricks_uri("databricks")
    assert is_databricks_uri("databricks:whatever")
    assert is_databricks_uri("databricks://whatever")
    assert not is_databricks_uri("mlruns")
    assert not is_databricks_uri("http://whatever")

    assert is_http_uri("http://whatever")
    assert is_http_uri("https://whatever")
    assert not is_http_uri("file://whatever")
    assert not is_http_uri("databricks://whatever")
    assert not is_http_uri("mlruns")
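# A minimal sketch of predicates consistent with the assertions above; treat
# this as an illustration of the expected behavior, not the shipped
# implementations.
import urllib.parse


def is_local_uri(uri):
    # Local means no scheme (bare or relative paths) or an explicit ``file``
    # scheme; the bare string "databricks" is special-cased out because it
    # parses with an empty scheme.
    scheme = urllib.parse.urlparse(uri).scheme
    return uri != "databricks" and scheme in ("", "file")


def is_databricks_uri(uri):
    # Matches "databricks", "databricks:<profile>", and "databricks://...".
    scheme = urllib.parse.urlparse(uri).scheme
    return scheme == "databricks" or uri == "databricks"


def is_http_uri(uri):
    scheme = urllib.parse.urlparse(uri).scheme
    return scheme in ("http", "https")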
def __init__(self, db_uri, default_artifact_root):
    """
    Create a database backed store.

    :param db_uri: The SQLAlchemy database URI string to connect to the database. See
                   the `SQLAlchemy docs
                   <https://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>`_
                   for format specifications. MLflow supports the dialects ``mysql``,
                   ``mssql``, ``sqlite``, and ``postgresql``.
    :param default_artifact_root: Path/URI to location suitable for large data
                                  (such as a blob store object, DBFS path, or
                                  shared NFS file system).
    """
    super(SqlAlchemyStore, self).__init__()
    self.db_uri = db_uri
    self.db_type = extract_db_type_from_uri(db_uri)
    self.artifact_root_uri = default_artifact_root
    self.engine = kiwi.store.db.utils.create_sqlalchemy_engine(db_uri)
    # On a completely fresh MLflow installation against an empty database (verify database
    # emptiness by checking that 'experiments' etc aren't in the list of table names), run all
    # DB migrations
    expected_tables = [
        SqlExperiment.__tablename__,
        SqlRun.__tablename__,
        SqlMetric.__tablename__,
        SqlParam.__tablename__,
        SqlTag.__tablename__,
        SqlExperimentTag.__tablename__,
        SqlLatestMetric.__tablename__,
    ]
    inspected_tables = set(sqlalchemy.inspect(self.engine).get_table_names())
    if any(table not in inspected_tables for table in expected_tables):
        kiwi.store.db.utils._initialize_tables(self.engine)
    Base.metadata.bind = self.engine
    SessionMaker = sqlalchemy.orm.sessionmaker(bind=self.engine)
    self.ManagedSessionMaker = kiwi.store.db.utils._get_managed_session_maker(
        SessionMaker, self.db_type)
    kiwi.store.db.utils._verify_schema(self.engine)

    if is_local_uri(default_artifact_root):
        mkdir(local_file_uri_to_path(default_artifact_root))

    if len(self.list_experiments()) == 0:
        with self.ManagedSessionMaker() as session:
            self._create_default_experiment(session)
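# Hedged usage sketch: construct the store against a local SQLite database
# with a local directory as the default artifact root. Both paths are
# illustrative, not part of the codebase.
store = SqlAlchemyStore("sqlite:///mlflow.db", "./mlruns")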
def server(backend_store_uri, default_artifact_root, host, port, workers,
           static_prefix, gunicorn_opts, waitress_opts, expose_prometheus):
    """
    Run the MLflow tracking server.

    The server listens on http://localhost:5000 by default and only accepts
    connections from the local machine. To let the server accept connections
    from other machines, you will need to pass ``--host 0.0.0.0`` to listen on
    all network interfaces (or a specific interface address).
    """
    _validate_server_args(gunicorn_opts=gunicorn_opts, workers=workers,
                          waitress_opts=waitress_opts)

    # Ensure that both backend_store_uri and default_artifact_uri are set correctly.
    if not backend_store_uri:
        backend_store_uri = DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH

    if not default_artifact_root:
        if is_local_uri(backend_store_uri):
            default_artifact_root = backend_store_uri
        else:
            eprint("Option 'default-artifact-root' is required when the backend "
                   "store is not local file based.")
            sys.exit(1)

    try:
        initialize_backend_stores(backend_store_uri, default_artifact_root)
    except Exception as e:  # pylint: disable=broad-except
        _logger.error("Error initializing backend store")
        _logger.exception(e)
        sys.exit(1)

    try:
        _run_server(backend_store_uri, default_artifact_root, host, port,
                    static_prefix, workers, gunicorn_opts, waitress_opts,
                    expose_prometheus)
    except ShellCommandException:
        eprint("Running the mlflow server failed. Please see the logs above for details.")
        sys.exit(1)
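# Hedged example invocation, assuming this function backs the standard
# ``mlflow server`` CLI entry point; the URIs below are illustrative:
#
#   mlflow server --backend-store-uri sqlite:///mlflow.db \
#                 --default-artifact-root ./mlruns \
#                 --host 0.0.0.0 --port 5000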
def ui(backend_store_uri, default_artifact_root, port, host):
    """
    Launch the MLflow tracking UI for local viewing of run results. To launch a
    production server, use the "mlflow server" command instead.

    The UI is visible at http://localhost:5000 by default and only accepts
    connections from the local machine. To let the UI server accept connections
    from other machines, you will need to pass ``--host 0.0.0.0`` to listen on
    all network interfaces (or a specific interface address).
    """
    # Ensure that both backend_store_uri and default_artifact_uri are set correctly.
    if not backend_store_uri:
        backend_store_uri = DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH

    if not default_artifact_root:
        if is_local_uri(backend_store_uri):
            default_artifact_root = backend_store_uri
        else:
            default_artifact_root = DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH

    try:
        initialize_backend_stores(backend_store_uri, default_artifact_root)
    except Exception as e:  # pylint: disable=broad-except
        _logger.error("Error initializing backend store")
        _logger.exception(e)
        sys.exit(1)

    # TODO: We eventually want to disable the write path in this version of the server.
    try:
        _run_server(backend_store_uri, default_artifact_root, host, port, None, 1)
    except ShellCommandException:
        eprint("Running the mlflow server failed. Please see the logs above for details.")
        sys.exit(1)
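# Hedged example invocation, assuming the standard ``mlflow ui`` entry point;
# with no options it falls back to the local default store and artifact root:
#
#   mlflow ui --backend-store-uri sqlite:///mlflow.db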
def log_model(spark_model, artifact_path, conda_env=None, dfs_tmpdir=None,
              sample_input=None, registered_model_name=None,
              signature: ModelSignature = None,
              input_example: ModelInputExample = None):
    """
    Log a Spark MLlib model as an MLflow artifact for the current run. This uses the
    MLlib persistence format and produces an MLflow Model with the Spark flavor.

    Note: If no run is active, it will instantiate a run to obtain a run_id.

    :param spark_model: Spark model to be saved - MLflow can only save descendants of
                        pyspark.ml.Model which implement MLReadable and MLWritable.
    :param artifact_path: Run relative artifact path.
    :param conda_env: Either a dictionary representation of a Conda environment or the path to a
                      Conda environment yaml file. If provided, this describes the environment
                      this model should be run in. At minimum, it should specify the dependencies
                      contained in :func:`get_default_conda_env()`. If ``None``, the default
                      :func:`get_default_conda_env()` environment is added to the model.
                      The following is an *example* dictionary representation of a Conda
                      environment::

                        {
                            'name': 'mlflow-env',
                            'channels': ['defaults'],
                            'dependencies': [
                                'python=3.7.0',
                                'pyspark=2.3.0'
                            ]
                        }
    :param dfs_tmpdir: Temporary directory path on Distributed (Hadoop) File System (DFS) or local
                       filesystem if running in local mode. The model is written in this
                       destination and then copied into the model's artifact directory. This is
                       necessary as Spark ML models read from and write to DFS if running on a
                       cluster. If this operation completes successfully, all temporary files
                       created on the DFS are removed. Defaults to ``/tmp/mlflow``.
    :param sample_input: A sample input used to add the MLeap flavor to the model. This must be a
                         PySpark DataFrame that the model can evaluate. If ``sample_input`` is
                         ``None``, the MLeap flavor is not added.
    :param registered_model_name: (Experimental) If given, create a model version under
                                  ``registered_model_name``, also creating a registered model if
                                  one with the given name does not exist.
    :param signature: (Experimental) :py:class:`ModelSignature <mlflow.models.ModelSignature>`
                      describes model input and output :py:class:`Schema <mlflow.types.Schema>`.
                      The model signature can be :py:func:`inferred <mlflow.models.infer_signature>`
                      from datasets with valid model input (e.g. the training dataset with target
                      column omitted) and valid model output (e.g. model predictions generated on
                      the training dataset), for example:

                      .. code-block:: python

                        from mlflow.models.signature import infer_signature
                        train = df.drop_column("target_label")
                        predictions = ...  # compute model predictions
                        signature = infer_signature(train, predictions)
    :param input_example: (Experimental) Input example provides one or several instances of valid
                          model input. The example can be used as a hint of what data to feed the
                          model. The given example will be converted to a Pandas DataFrame and then
                          serialized to json using the Pandas split-oriented format. Bytes are
                          base64-encoded.

    .. code-block:: python
        :caption: Example

        from pyspark.ml import Pipeline
        from pyspark.ml.classification import LogisticRegression
        from pyspark.ml.feature import HashingTF, Tokenizer

        training = spark.createDataFrame([
            (0, "a b c d e spark", 1.0),
            (1, "b d", 0.0),
            (2, "spark f g h", 1.0),
            (3, "hadoop mapreduce", 0.0)], ["id", "text", "label"])
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
        lr = LogisticRegression(maxIter=10, regParam=0.001)
        pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
        model = pipeline.fit(training)
        mlflow.spark.log_model(model, "spark-model")
    """
    from py4j.protocol import Py4JJavaError

    _validate_model(spark_model)
    from pyspark.ml import PipelineModel
    if not isinstance(spark_model, PipelineModel):
        spark_model = PipelineModel([spark_model])
    run_id = kiwi.tracking.fluent._get_or_start_run().info.run_id
    run_root_artifact_uri = kiwi.get_artifact_uri()
    # If the artifact URI is a local filesystem path, defer to Model.log() to persist the model,
    # since Spark may not be able to write directly to the driver's filesystem. For example,
    # writing to `file:/uri` will write to the local filesystem from each executor, which will
    # be incorrect on multi-node clusters - to avoid such issues we just use the Model.log() path
    # here.
    if is_local_uri(run_root_artifact_uri):
        return Model.log(artifact_path=artifact_path, flavor=kiwi.spark,
                         spark_model=spark_model, conda_env=conda_env,
                         dfs_tmpdir=dfs_tmpdir, sample_input=sample_input,
                         registered_model_name=registered_model_name,
                         signature=signature, input_example=input_example)
    # If Spark cannot write directly to the artifact repo, defer to Model.log() to persist the
    # model
    model_dir = os.path.join(run_root_artifact_uri, artifact_path)
    try:
        spark_model.save(os.path.join(model_dir, _SPARK_MODEL_PATH_SUB))
    except Py4JJavaError:
        return Model.log(artifact_path=artifact_path, flavor=kiwi.spark,
                         spark_model=spark_model, conda_env=conda_env,
                         dfs_tmpdir=dfs_tmpdir, sample_input=sample_input,
                         registered_model_name=registered_model_name,
                         signature=signature, input_example=input_example)

    # Otherwise, override the default model log behavior and save model directly to artifact repo
    mlflow_model = Model(artifact_path=artifact_path, run_id=run_id)
    with TempDir() as tmp:
        tmp_model_metadata_dir = tmp.path()
        _save_model_metadata(tmp_model_metadata_dir, spark_model, mlflow_model,
                             sample_input, conda_env, signature=signature,
                             input_example=input_example)
        kiwi.tracking.fluent.log_artifacts(tmp_model_metadata_dir, artifact_path)
        if registered_model_name is not None:
            kiwi.register_model("runs:/%s/%s" % (run_id, artifact_path),
                                registered_model_name)
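# Hedged usage sketch building on the docstring example above: logging the
# fitted pipeline with an inferred signature and an input example. The
# conversion to pandas and the column names are assumptions for illustration.
from mlflow.models.signature import infer_signature

train_pdf = training.toPandas().drop(columns=["label"])
preds_pdf = model.transform(training).select("prediction").toPandas()
signature = infer_signature(train_pdf, preds_pdf)
mlflow.spark.log_model(model, "spark-model",
                       signature=signature,
                       input_example=train_pdf.head(2))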