Example #1
def _save_model_metadata(dst_dir, spark_model, mlflow_model, sample_input,
                         conda_env):
    """
    Saves model metadata into the passed-in directory. The persisted metadata assumes that a
    model can be loaded from a relative path to the metadata file (currently hard-coded to
    "sparkml").
    """
    if sample_input is not None:
        mleap.add_to_model(mlflow_model, dst_dir, spark_model, sample_input)

    pyspark_version = pyspark.version.__version__

    conda_env_subpath = "conda.yaml"
    if conda_env is None:
        conda_env = DEFAULT_CONDA_ENV
    elif not isinstance(conda_env, dict):
        with open(conda_env, "r") as f:
            conda_env = yaml.safe_load(f)
    with open(os.path.join(dst_dir, conda_env_subpath), "w") as f:
        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)

    mlflow_model.add_flavor(FLAVOR_NAME,
                            pyspark_version=pyspark_version,
                            model_data=_SPARK_MODEL_PATH_SUB)
    pyfunc.add_to_model(mlflow_model,
                        loader_module="mlflow.spark",
                        data=_SPARK_MODEL_PATH_SUB,
                        env=conda_env_subpath)
    mlflow_model.save(os.path.join(dst_dir, "MLmodel"))
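For orientation, here is a hedged sketch of roughly what the MLmodel file written by the calls above contains. The flavor names and keys follow the add_flavor / pyfunc.add_to_model arguments in this snippet; the concrete values (the pyspark version and the "sparkml" subpath) are illustrative assumptions, not output copied from MLflow.

# Hedged sketch: approximate MLmodel content produced by the helper above.
# Keys mirror add_flavor / pyfunc.add_to_model; the values are illustrative.
import yaml

mlmodel_sketch = {
    "flavors": {
        "spark": {
            "pyspark_version": "2.4.0",   # pyspark.version.__version__ at save time
            "model_data": "sparkml",      # _SPARK_MODEL_PATH_SUB
        },
        "python_function": {
            "loader_module": "mlflow.spark",
            "data": "sparkml",
            "env": "conda.yaml",          # conda_env_subpath
        },
    }
}
print(yaml.safe_dump(mlmodel_sketch, default_flow_style=False))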
Example #2
def _save_model_metadata(dst_dir, spark_model, mlflow_model, sample_input, conda_env,
                         signature=None, input_example=None):
    """
    Saves model metadata into the passed-in directory. The persisted metadata assumes that a
    model can be loaded from a relative path to the metadata file (currently hard-coded to
    "sparkml").
    """
    import pyspark

    if sample_input is not None:
        mleap.add_to_model(mlflow_model=mlflow_model, path=dst_dir, spark_model=spark_model,
                           sample_input=sample_input)
    if signature is not None:
        mlflow_model.signature = signature
    if input_example is not None:
        _save_example(mlflow_model, input_example, dst_dir)
    conda_env_subpath = "conda.yaml"
    if conda_env is None:
        conda_env = get_default_conda_env()
    elif not isinstance(conda_env, dict):
        with open(conda_env, "r") as f:
            conda_env = yaml.safe_load(f)
    with open(os.path.join(dst_dir, conda_env_subpath), "w") as f:
        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)

    mlflow_model.add_flavor(FLAVOR_NAME, pyspark_version=pyspark.__version__,
                            model_data=_SPARK_MODEL_PATH_SUB)
    pyfunc.add_to_model(mlflow_model, loader_module="mlflow.spark", data=_SPARK_MODEL_PATH_SUB,
                        env=conda_env_subpath)
    mlflow_model.save(os.path.join(dst_dir, MLMODEL_FILE_NAME))
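This version adds the signature and input_example parameters. A hedged sketch of how those values are typically produced before they reach this helper: infer_signature is the public MLflow API for this, while the pandas DataFrame and its column names below are purely illustrative.

# Hedged sketch: building a signature and input example for the new parameters.
# The DataFrame, column names, and predictions are illustrative assumptions.
import pandas as pd
from mlflow.models.signature import infer_signature

pdf = pd.DataFrame({"a": [1.0, 3.0], "b": [2.0, 4.0]})
predictions = pd.Series([0.0, 1.0])
signature = infer_signature(pdf, predictions)
input_example = pdf.head(1)
# The public mlflow.spark.save_model / log_model forward these values to
# _save_model_metadata as `signature` and `input_example`.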
Example #3
def save_model(spark_model, path, mlflow_model=Model(), conda_env=None, jars=None,
               dfs_tmpdir=None, sample_input=None):
    """
    Save a Spark MLlib PipelineModel to a local path.

    By default, this function saves models using the Spark MLlib persistence mechanism.
    Additionally, if a sample input is specified using the ``sample_input`` parameter, the model
    is also serialized in MLeap format and the MLeap flavor is added.

    :param spark_model: Spark PipelineModel to be saved. Can save only PipelineModels.
    :param path: Local path where the model is to be saved.
    :param mlflow_model: MLflow model config this flavor is being added to.
    :param conda_env: Conda environment this model depends on.
    :param jars: List of JARs needed by the model.
    :param dfs_tmpdir: Temporary directory path on Distributed (Hadoop) File System (DFS) or local
                       filesystem if running in local mode. The model will be written in this
                       destination and then copied to the requested local path. This is necessary
                       as Spark ML models read from and write to DFS if running on a cluster. All
                       temporary files created on the DFS will be removed if this operation
                       completes successfully. Defaults to ``/tmp/mlflow``.
    :param sample_input: A sample input that will be used to add the MLeap flavor to the model.
                         This must be a PySpark DataFrame that the model can evaluate. If
                         ``sample_input`` is ``None``, the MLeap flavor is not added.

    >>> import mlflow.spark
    >>> from pyspark.ml.pipeline import PipelineModel
    >>>
    >>> # your trained pyspark.ml.pipeline.PipelineModel
    >>> model = ...
    >>> mlflow.spark.save_model(model, "spark-model")
    """
    dfs_tmpdir = dfs_tmpdir if dfs_tmpdir is not None else DFS_TMP
    if jars:
        raise Exception("jar dependencies are not implemented")

    if sample_input is not None:
        mleap.add_to_model(mlflow_model, path, spark_model, sample_input)

    if not isinstance(spark_model, PipelineModel):
        raise Exception("Not a PipelineModel. SparkML can only save PipelineModels.")

    # Spark ML stores the model on DFS if running on a cluster
    # Save it to a DFS temp dir first and copy it to local path
    tmp_path = _tmp_path(dfs_tmpdir)
    spark_model.save(tmp_path)
    sparkml_data_path_sub = "sparkml"
    sparkml_data_path = os.path.abspath(os.path.join(path, sparkml_data_path_sub))
    _HadoopFileSystem.copy_to_local_file(tmp_path, sparkml_data_path, removeSrc=True)
    pyspark_version = pyspark.version.__version__
    model_conda_env = None
    if conda_env:
        model_conda_env = os.path.basename(os.path.abspath(conda_env))
        shutil.copyfile(conda_env, os.path.join(path, model_conda_env))
    mlflow_model.add_flavor(FLAVOR_NAME, pyspark_version=pyspark_version,
                            model_data=sparkml_data_path_sub)
    pyfunc.add_to_model(mlflow_model, loader_module="mlflow.spark", data=sparkml_data_path_sub,
                        env=model_conda_env)
    mlflow_model.save(os.path.join(path, "MLmodel"))
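A hedged usage sketch to accompany the docstring above: it fits a trivial PipelineModel and saves it. An active SparkSession named spark is assumed, and the data, stages, and column names are illustrative, not taken from the source.

# Hedged sketch: constructing a PipelineModel that save_model accepts.
# Assumes an active SparkSession named `spark`; data and stages are illustrative.
import mlflow.spark
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler

df = spark.createDataFrame([(1.0, 2.0), (3.0, 4.0)], ["a", "b"])
pipeline = Pipeline(stages=[VectorAssembler(inputCols=["a", "b"], outputCol="features")])
model = pipeline.fit(df)  # a pyspark.ml.PipelineModel
mlflow.spark.save_model(model, "spark-model")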
Example #4
def _save_model_metadata(
    dst_dir,
    spark_model,
    mlflow_model,
    sample_input,
    conda_env,
    signature=None,
    input_example=None,
    pip_requirements=None,
    extra_pip_requirements=None,
):
    """
    Saves model metadata into the passed-in directory. The persisted metadata assumes that a
    model can be loaded from a relative path to the metadata file (currently hard-coded to
    "sparkml").
    """
    import pyspark

    if sample_input is not None:
        mleap.add_to_model(
            mlflow_model=mlflow_model,
            path=dst_dir,
            spark_model=spark_model,
            sample_input=sample_input,
        )
    if signature is not None:
        mlflow_model.signature = signature
    if input_example is not None:
        _save_example(mlflow_model, input_example, dst_dir)

    conda_env, pip_requirements, pip_constraints = (_process_pip_requirements(
        get_default_pip_requirements(),
        pip_requirements,
        extra_pip_requirements,
    ) if conda_env is None else _process_conda_env(conda_env))

    with open(os.path.join(dst_dir, _CONDA_ENV_FILE_NAME), "w") as f:
        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)

    # Save `constraints.txt` if necessary
    if pip_constraints:
        write_to(os.path.join(dst_dir, _CONSTRAINTS_FILE_NAME),
                 "\n".join(pip_constraints))

    # Save `requirements.txt`
    write_to(os.path.join(dst_dir, _REQUIREMENTS_FILE_NAME),
             "\n".join(pip_requirements))

    mlflow_model.add_flavor(FLAVOR_NAME,
                            pyspark_version=pyspark.__version__,
                            model_data=_SPARK_MODEL_PATH_SUB)
    pyfunc.add_to_model(
        mlflow_model,
        loader_module="mlflow.spark",
        data=_SPARK_MODEL_PATH_SUB,
        env=_CONDA_ENV_FILE_NAME,
    )
    mlflow_model.save(os.path.join(dst_dir, MLMODEL_FILE_NAME))
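For context, a hedged sketch of how the pip_requirements / extra_pip_requirements arguments are usually supplied through the public API in the MLflow versions that carry this helper; the pinned package below is an illustrative example, not something from the source.

# Hedged sketch: the public save_model/log_model pass these arguments through to
# _save_model_metadata, which then writes conda.yaml, requirements.txt and,
# when constraints are present, constraints.txt.
import mlflow.spark

mlflow.spark.save_model(
    model,                                     # a fitted PipelineModel
    "spark-model",
    extra_pip_requirements=["pandas==1.3.5"],  # illustrative pin
)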
Example #5
def _save_model_metadata(
    dst_dir,
    spark_model,
    mlflow_model,
    sample_input,
    conda_env,
    signature=None,
    input_example=None,
    pip_requirements=None,
    extra_pip_requirements=None,
):
    """
    Saves model metadata into the passed-in directory. The persisted metadata assumes that a
    model can be loaded from a relative path to the metadata file (currently hard-coded to
    "sparkml").
    """
    import pyspark

    if sample_input is not None:
        mleap.add_to_model(
            mlflow_model=mlflow_model,
            path=dst_dir,
            spark_model=spark_model,
            sample_input=sample_input,
        )
    if signature is not None:
        mlflow_model.signature = signature
    if input_example is not None:
        _save_example(mlflow_model, input_example, dst_dir)

    mlflow_model.add_flavor(FLAVOR_NAME,
                            pyspark_version=pyspark.__version__,
                            model_data=_SPARK_MODEL_PATH_SUB)
    pyfunc.add_to_model(
        mlflow_model,
        loader_module="mlflow.spark",
        data=_SPARK_MODEL_PATH_SUB,
        env=_CONDA_ENV_FILE_NAME,
    )
    mlflow_model.save(os.path.join(dst_dir, MLMODEL_FILE_NAME))

    if conda_env is None:
        if pip_requirements is None:
            default_reqs = get_default_pip_requirements()
            # To ensure `_load_pyfunc` can successfully load the model during the dependency
            # inference, `mlflow_model.save` must be called beforehand to save an MLmodel file.
            inferred_reqs = mlflow.models.infer_pip_requirements(
                dst_dir,
                FLAVOR_NAME,
                fallback=default_reqs,
            )
            default_reqs = sorted(set(inferred_reqs).union(default_reqs))
        else:
            default_reqs = None
        conda_env, pip_requirements, pip_constraints = _process_pip_requirements(
            default_reqs,
            pip_requirements,
            extra_pip_requirements,
        )
    else:
        conda_env, pip_requirements, pip_constraints = _process_conda_env(
            conda_env)

    with open(os.path.join(dst_dir, _CONDA_ENV_FILE_NAME), "w") as f:
        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)

    # Save `constraints.txt` if necessary
    if pip_constraints:
        write_to(os.path.join(dst_dir, _CONSTRAINTS_FILE_NAME),
                 "\n".join(pip_constraints))

    # Save `requirements.txt`
    write_to(os.path.join(dst_dir, _REQUIREMENTS_FILE_NAME),
             "\n".join(pip_requirements))
Example #6
def save_model(spark_model, path, mlflow_model=Model(), conda_env=None, jars=None,
               dfs_tmpdir=None, sample_input=None):
    """
    Save a Spark MLlib PipelineModel to a local path.

    By default, this function saves models using the Spark MLlib persistence mechanism.
    Additionally, if a sample input is specified using the ``sample_input`` parameter, the model
    is also serialized in MLeap format and the MLeap flavor is added.

    :param spark_model: Spark PipelineModel to be saved. Can save only PipelineModels.
    :param path: Local path where the model is to be saved.
    :param mlflow_model: MLflow model config this flavor is being added to.
    :param conda_env: Either a dictionary representation of a Conda environment or the path to a
                      Conda environment yaml file. If provided, this describes the environment
                      this model should be run in. At minimum, it should specify the dependencies
                      contained in ``mlflow.spark.DEFAULT_CONDA_ENV``. If ``None``, the default
                      ``mlflow.spark.DEFAULT_CONDA_ENV`` environment will be added to the model.
                      The following is an *example* dictionary representation of a Conda
                      environment::

                        {
                            'name': 'mlflow-env',
                            'channels': ['defaults'],
                            'dependencies': [
                                'python=3.7.0',
                                'pyspark=2.3.0'
                            ]
                        }

    :param jars: List of JARs needed by the model.
    :param dfs_tmpdir: Temporary directory path on Distributed (Hadoop) File System (DFS) or local
                       filesystem if running in local mode. The model will be written in this
                       destination and then copied to the requested local path. This is necessary
                       as Spark ML models read from and write to DFS if running on a cluster. All
                       temporary files created on the DFS will be removed if this operation
                       completes successfully. Defaults to ``/tmp/mlflow``.
    :param sample_input: A sample input that will be used to add the MLeap flavor to the model.
                         This must be a PySpark DataFrame that the model can evaluate. If
                         ``sample_input`` is ``None``, the MLeap flavor is not added.

    >>> import mlflow.spark
    >>> from pyspark.ml.pipeline import PipelineModel
    >>>
    >>> # your trained pyspark.ml.pipeline.PipelineModel
    >>> model = ...
    >>> mlflow.spark.save_model(model, "spark-model")
    """
    if jars:
        raise Exception("jar dependencies are not implemented")

    if sample_input is not None:
        mleap.add_to_model(mlflow_model, path, spark_model, sample_input)

    if not isinstance(spark_model, PipelineModel):
        raise Exception("Not a PipelineModel. SparkML can only save PipelineModels.")

    # Spark ML stores the model on DFS if running on a cluster
    # Save it to a DFS temp dir first and copy it to local path
    if dfs_tmpdir is None:
        dfs_tmpdir = DFS_TMP
    tmp_path = _tmp_path(dfs_tmpdir)
    spark_model.save(tmp_path)
    sparkml_data_path_sub = "sparkml"
    sparkml_data_path = os.path.abspath(os.path.join(path, sparkml_data_path_sub))
    _HadoopFileSystem.copy_to_local_file(tmp_path, sparkml_data_path, remove_src=True)
    pyspark_version = pyspark.version.__version__

    conda_env_subpath = "conda.yaml"
    if conda_env is None:
        conda_env = DEFAULT_CONDA_ENV
    elif not isinstance(conda_env, dict):
        with open(conda_env, "r") as f:
            conda_env = yaml.safe_load(f)
    with open(os.path.join(path, conda_env_subpath), "w") as f:
        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)

    mlflow_model.add_flavor(FLAVOR_NAME, pyspark_version=pyspark_version,
                            model_data=sparkml_data_path_sub)
    pyfunc.add_to_model(mlflow_model, loader_module="mlflow.spark", data=sparkml_data_path_sub,
                        env=conda_env_subpath)
    mlflow_model.save(os.path.join(path, "MLmodel"))
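Finally, a hedged usage sketch matching the conda_env documentation above: the dictionary mirrors the example from the docstring, with the dependency versions treated as placeholders.

# Hedged sketch: saving with an explicit conda environment, as described in the
# docstring above. The model is assumed to be a fitted PipelineModel; the
# dependency versions are placeholders.
import mlflow.spark

conda_env = {
    "name": "mlflow-env",
    "channels": ["defaults"],
    "dependencies": ["python=3.7.0", "pyspark=2.3.0"],
}
mlflow.spark.save_model(model, "spark-model", conda_env=conda_env)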