Пример #1
0
def main(config: Config) -> None:
    """Create the configured experiment (best effort) and launch the project.

    The artifact location comes from the ``MLFLOW_ARTIFACT_LOCATION``
    environment variable, falling back to a path under ``./data/processed``.
    The project at ``config.uri`` is then run through its ``main`` entry
    point without conda; ``config.git_version`` is forwarded as ``version``
    only when it is not ``None``.
    """
    artifact_location = os.environ.get(
        "MLFLOW_ARTIFACT_LOCATION", "./data/processed/mlruns/artifacts"
    )

    tracking_client = mlf_tracking.MlflowClient()
    try:
        tracking_client.create_experiment(
            config.experiment_name,
            artifact_location=artifact_location,
        )
    except mlf_exceptions.MlflowException:
        # Any MLflow error raised by the creation call is ignored on purpose
        # (presumably the experiment already exists - TODO confirm).
        pass

    # Only pass ``version`` when a git version was actually configured.
    optional_kwargs = (
        {} if config.git_version is None else {"version": config.git_version}
    )

    mlf_projects.run(
        config.uri,
        entry_point="main",
        experiment_name=config.experiment_name,
        use_conda=False,
        **optional_kwargs,
    )
Пример #2
0
def run(uri, entry_point, version, param_list, experiment_id, mode, cluster_spec, git_username,
        git_password, no_conda, new_dir, storage_dir):
    """
    Run an MLflow project from the given URI.

    If running locally (the default), the URI can be either a Git repository URI or a local path.
    If running on Databricks, the URI must be a Git repository.

    By default, Git projects will run in a new working directory with the given parameters, while
    local projects will run from the project's root directory.

    ``param_list`` holds ``name=value`` strings. A malformed entry or a repeated
    name prints a message to stderr and exits the process with status 1. The
    same happens when ``projects.run`` raises ``projects.ExecutionException``.
    """
    param_dict = {}
    for s in param_list:
        # Split on the first "=" only, so values may themselves contain "=".
        name, separator, value = s.partition("=")
        if not separator:
            print("Invalid format for -P parameter: '%s'. Use -P name=value." % s, file=sys.stderr)
            sys.exit(1)
        # Look up the same (encoded) key that is stored, so the duplicate
        # check cannot disagree with the dictionary contents.
        key = _encode(name)
        if key in param_dict:
            print("Repeated parameter: '%s'" % name, file=sys.stderr)
            sys.exit(1)
        param_dict[key] = _encode(value)
    try:
        projects.run(_encode(uri), _encode(entry_point), _encode(version),
                     experiment_id=experiment_id,
                     parameters=param_dict, mode=_encode(mode),
                     cluster_spec=_encode(cluster_spec),
                     git_username=_encode(git_username),
                     git_password=_encode(git_password), use_conda=(not no_conda),
                     use_temp_cwd=new_dir, storage_dir=_encode(storage_dir))
    except projects.ExecutionException as e:
        # Exceptions do not carry a ``message`` attribute by default on
        # Python 3; fall back to the exception itself so reporting never fails.
        print(getattr(e, "message", e), file=sys.stderr)
        sys.exit(1)
Пример #3
0
def test_dnn():
    """Run the dataset download and DNN regression projects, then check predictions.

    The tracking URI is pointed at a temporary directory for the duration of
    the test and restored afterwards.
    """
    # Keyword arguments shared by both project launches.
    common_kwargs = dict(
        entry_point="main",
        version=None,
        mode="local",
        cluster_spec=None,
        git_username=None,
        git_password=None,
        use_conda=True,
        storage_dir=None,
    )
    previous_uri = tracking.get_tracking_uri()
    try:
        with TempDir(chdr=False, remove_on_exit=True) as tmp:
            diamonds = tmp.path("diamonds")
            estimator = tmp.path("estimator")
            artifacts = tmp.path("artifacts")
            for directory in (diamonds, estimator, artifacts):
                os.mkdir(directory)
            tracking.set_tracking_uri(artifacts)

            # Download the diamonds dataset via mlflow run
            run(".",
                parameters={"dest-dir": diamonds},
                experiment_id=tracking._get_experiment_id(),
                **common_kwargs)

            train_path = os.path.join(diamonds, "train_diamonds.parquet")
            test_path = os.path.join(diamonds, "test_diamonds.parquet")

            # Run the main dnn app via mlflow
            dnn_parameters = {
                "model-dir": estimator,
                "train": train_path,
                "test": test_path,
                "hidden-units": "30,30",
                "label-col": "price",
                "steps": 5000,
                "batch-size": 128
            }
            run("apps/dnn-regression",
                parameters=dnn_parameters,
                experiment_id=tracking._get_experiment_id(),
                **common_kwargs)

            # Load the first entry of the model directory as a pyfunc.
            saved_model_dir = os.path.join(estimator, os.listdir(estimator)[0])
            pyfunc = tensorflow.load_pyfunc(saved_model_dir)

            df = pandas.read_parquet(test_path)

            predict_df = pyfunc.predict(df)
            assert 'predictions' in predict_df
            assert isinstance(predict_df['predictions'][0][0], numpy.float32)
    finally:
        tracking.set_tracking_uri(previous_uri)
Пример #4
0
def run(uri, entry_point, version, param_list, experiment_id, mode,
        cluster_spec, git_username, git_password, no_conda, storage_dir,
        run_id):
    """
    Run an MLflow project from the given URI.

    For local runs, the run will block until it completes.
    Otherwise, the project will run asynchronously.

    If running locally (the default), the URI can be either a Git repository URI or a local path.
    If running on Databricks, the URI must be a Git repository.

    By default, Git projects run in a new working directory with the given parameters, while
    local projects run from the project's root directory.

    ``param_list`` holds ``name=value`` strings; a malformed entry or a repeated
    name prints a message to stderr and exits with status 1. A ``cluster_spec``
    that does not end in ``.json`` is parsed as inline JSON; a parse failure is
    reported on stderr and the ``ValueError`` is re-raised. A
    ``projects.ExecutionException`` from the launch is logged and the process
    exits with status 1.
    """
    param_dict = {}
    for s in param_list:
        # Split on the first "=" only, so values may themselves contain "=".
        name, separator, value = s.partition("=")
        if not separator:
            print("Invalid format for -P parameter: '%s'. Use -P name=value." %
                  s,
                  file=sys.stderr)
            sys.exit(1)
        if name in param_dict:
            print("Repeated parameter: '%s'" % name, file=sys.stderr)
            sys.exit(1)
        param_dict[name] = value

    cluster_spec_arg = cluster_spec
    # Anything that is not a path to a ".json" file is treated as inline JSON.
    if cluster_spec is not None and os.path.splitext(
            cluster_spec)[-1] != ".json":
        try:
            cluster_spec_arg = json.loads(cluster_spec)
        except ValueError as e:
            # Report on stderr, like every other error message of this command.
            print("Invalid cluster spec JSON. Parse error: %s" % e,
                  file=sys.stderr)
            raise
    try:
        projects.run(
            uri,
            entry_point,
            version,
            experiment_id=experiment_id,
            parameters=param_dict,
            mode=mode,
            cluster_spec=cluster_spec_arg,
            git_username=git_username,
            git_password=git_password,
            use_conda=(not no_conda),
            storage_dir=storage_dir,
            # Only local runs (explicit or default mode) wait for completion.
            block=(mode == "local" or mode is None),
            run_id=run_id,
        )
    except projects.ExecutionException as e:
        _logger.error("=== %s ===", e)
        sys.exit(1)
Пример #5
0
def test_gbt():
    """Run the dataset download and GBT regression projects, then check predictions.

    The tracking URI is pointed at a temporary directory while the test runs
    and restored afterwards.
    """
    # Keyword arguments shared by both project launches.
    common_kwargs = dict(
        entry_point="main", version=None, experiment_id=0, mode="local",
        cluster_spec=None, git_username=None, git_password=None,
        use_conda=True, storage_dir=None)
    previous_uri = tracking.get_tracking_uri()
    with TempDir(chdr=False, remove_on_exit=True) as tmp:
        try:
            diamonds = tmp.path("diamonds")
            artifacts = tmp.path("artifacts")
            for directory in (diamonds, artifacts):
                os.mkdir(directory)
            tracking.set_tracking_uri(artifacts)

            # Download the diamonds dataset via mlflow run
            run(".", parameters={"dest-dir": diamonds}, **common_kwargs)

            # Snapshot the entries under experiment "0" before the second run.
            initial = os.path.join(artifacts, "0")
            entries_before = os.listdir(initial)

            train_path = os.path.join(diamonds, "train_diamonds.parquet")
            test_path = os.path.join(diamonds, "test_diamonds.parquet")

            # Run the main gbt app via mlflow
            gbt_parameters = {"train": train_path,
                              "test": test_path,
                              "n-trees": 10,
                              "m-depth": 3,
                              "learning-rate": .1,
                              "loss": "rmse",
                              "label-col": "price"}
            run("apps/gbt-regression", parameters=gbt_parameters, **common_kwargs)

            # Identify the new run's folder: the last entry that was not there before.
            new_entries = [entry for entry in os.listdir(initial)
                           if entry not in entries_before]
            main = new_entries[-1] if new_entries else None

            pyfunc = load_pyfunc(os.path.join(initial, main, "artifacts/model/model.pkl"))
            df = pandas.read_parquet(test_path)

            # Drop the label column so only the features are used to predict.
            df = df.drop(columns="price")

            # Predict from the saved pyfunc
            predict = pyfunc.predict(df)

            # Make sure the data is of the right type
            assert isinstance(predict[0], numpy.float32)
        finally:
            tracking.set_tracking_uri(previous_uri)
Пример #6
0
def run(uri, entry_point, version, param_list, experiment_id, mode,
        cluster_spec, git_username, git_password, no_conda, new_dir,
        storage_dir, run_id):
    """
    Run an MLflow project from the given URI.

    Blocks till the run completes.

    If running locally (the default), the URI can be either a Git repository URI or a local path.
    If running on Databricks, the URI must be a Git repository.

    By default, Git projects run in a new working directory with the given parameters, while
    local projects run from the project's root directory.

    Each ``param_list`` entry must look like ``name=value``; a malformed or
    repeated entry is reported on stderr and the process exits with status 1.
    A ``projects.ExecutionException`` has its traceback printed to stderr and
    also ends the process with status 1.
    """
    def _abort(message):
        # Report a usage error and terminate with a failing exit status.
        print(message, file=sys.stderr)
        sys.exit(1)

    parsed_params = {}
    for raw in param_list:
        # Everything before the first "=" is the name, the rest is the value.
        key, separator, val = raw.partition("=")
        if separator == "":
            _abort("Invalid format for -P parameter: '%s'. Use -P name=value." %
                   raw)
        if key in parsed_params:
            _abort("Repeated parameter: '%s'" % key)
        parsed_params[key] = val

    try:
        projects.run(
            uri,
            entry_point,
            version,
            experiment_id=experiment_id,
            parameters=parsed_params,
            mode=mode,
            cluster_spec=cluster_spec,
            git_username=git_username,
            git_password=git_password,
            use_conda=not no_conda,
            use_temp_cwd=new_dir,
            storage_dir=storage_dir,
            block=True,
            run_id=run_id,
        )
    except projects.ExecutionException:
        import traceback
        traceback.print_exc(file=sys.stderr)
        sys.exit(1)
def reproduce_run(run_id, experiment_name, rel_tol=1e-09, verbose=False):
    """Re-run the project behind an existing run and print a metric comparison.

    The source URI and git commit are read from the target run's tags, the
    project is launched again with the target run's parameters under
    ``experiment_name``, and the metrics of both runs are printed side by side.

    :param run_id: ID of the target run to reproduce.
    :param experiment_name: Experiment under which the reproduced run is launched.
    :param rel_tol: Tolerance forwarded to ``runs_equal``.
    :param verbose: Forwarded to ``dump_run``.
    """
    # Get target run
    run1 = client.get_run(run_id)
    dump_run(run1, "Target Run", verbose)
    uri = get_tag(run1, mlflow_tags.MLFLOW_SOURCE_NAME)
    print(f"git_uri: {mlflow_tags.MLFLOW_SOURCE_NAME}: {uri}")
    version = get_tag(run1, mlflow_tags.MLFLOW_GIT_COMMIT)
    print("version:", version)

    # Execute the run - reproduced run
    res = projects.run(uri, parameters=run1.data.params, version=version,
                       experiment_name=experiment_name)
    print("Reproduced Run Result:")
    print("  run_id:", res.run_id)
    print("  get_status:", res.get_status())

    # Print results of reproduced run
    run2 = client.get_run(res.run_id)
    dump_run(run2, "Reproduced Run", verbose)

    # Print metrics comparison between target and reproduced run.
    # A metric missing from the reproduced run is shown as an empty value
    # instead of aborting the whole report with a KeyError.
    reproduced_metrics = run2.data.metrics
    data = [[k, v, reproduced_metrics.get(k)] for k, v in run1.data.metrics.items()]
    df = pd.DataFrame(data, columns=["Metric", "Run1", "Run2"])
    print()
    print(tabulate(df, headers="keys", tablefmt="psql", showindex=False))
    eq = runs_equal(run1, run2, rel_tol)
    print("Runs equal:", eq)
Пример #8
0
def test_simple_run_pip_synchronous():
    """Launch the pip project on the "yarn" backend synchronously and check its logs."""
    project_dir = os.path.join(os.path.dirname(__file__), "resources", "pip_project")
    submitted_run = projects.run(
        project_dir,
        backend="yarn",
        entry_point="compute_intersection",
        parameters={"size": 10003},
        synchronous=True,
    )

    # Per the original author's note: a failed run raises ExecutionException
    # without information about the active run, so reaching this point is
    # checked as a success.
    _check_merged_logs(submitted_run.skein_app_id, "Time taken in secs:", True)
Пример #9
0
def test_simple_run_pip():
    """Launch the pip project on the "yarn" backend asynchronously, wait, and check its logs."""
    project_dir = os.path.join(os.path.dirname(__file__), "resources", "pip_project")
    submitted_run = projects.run(
        project_dir,
        backend="yarn",
        entry_point="compute_intersection",
        parameters={"size": 10002},
        synchronous=False,
    )

    # The launch returned immediately; the result of wait() is what gets checked.
    outcome = submitted_run.wait()
    _check_merged_logs(submitted_run.skein_app_id, "Time taken in secs:", outcome)
Пример #10
0
def test_linear():
    """Run the dataset download and linear regression projects, then check predictions.

    The tracking URI is pointed at a temporary directory while the test runs
    and restored afterwards.
    """
    # Keyword arguments shared by both project launches.
    common_kwargs = dict(
        entry_point="main",
        version=None,
        mode="local",
        cluster_spec=None,
        git_username=None,
        git_password=None,
        use_conda=True,
        storage_dir=None,
    )
    previous_uri = tracking.get_tracking_uri()
    with TempDir(chdr=False, remove_on_exit=True) as tmp:
        try:
            diamonds = tmp.path("diamonds")
            root_tracking_dir = tmp.path("root_tracking_dir")
            for directory in (diamonds, root_tracking_dir):
                os.mkdir(directory)
            tracking.set_tracking_uri(root_tracking_dir)

            # Download the diamonds dataset via mlflow run
            mlflow.set_experiment("test-experiment")
            run(".", parameters={"dest-dir": diamonds}, **common_kwargs)

            train_path = os.path.join(diamonds, "train_diamonds.parquet")
            test_path = os.path.join(diamonds, "test_diamonds.parquet")

            # Run the main linear app via mlflow
            linear_parameters = {
                "train": train_path,
                "test": test_path,
                "alpha": .001,
                "l1-ratio": .5,
                "label-col": "price"
            }
            submitted_run = run("apps/linear-regression",
                                parameters=linear_parameters,
                                **common_kwargs)

            pyfunc = load_pyfunc(path="model", run_id=submitted_run.run_id)

            df = pandas.read_parquet(test_path)

            # Drop the label column so only the features are used to predict.
            df = df.drop(columns="price")

            # Predict from the saved pyfunc
            predict = pyfunc.predict(df)

            # Make sure the data is of the right type
            assert isinstance(predict[0], numpy.float64)
        finally:
            tracking.set_tracking_uri(previous_uri)
Пример #11
0
# COMMAND ----------

# MAGIC %md Use MLflow Fluent API

# COMMAND ----------

# Launch the example project from GitHub through the top-level `mlflow.run`
# call with two parameters, then print the returned run's status and id.
# NOTE(review): `mlflow` is only imported further down in the visible code;
# presumably an earlier notebook cell already imported it - confirm.
res_sub = mlflow.run("https://github.com/mlflow/mlflow-example",
                     parameters={
                         "alpha": 0.6,
                         "l1_ratio": 0.1
                     })
print(f"status={res_sub.get_status()}")
print(f"run_id={res_sub.run_id}")

# COMMAND ----------

# MAGIC %md Use MLflow Projects API

# COMMAND ----------

# Same kind of launch through `projects.run`, against a different repository.
# NOTE(review): `projects` is not imported anywhere in the visible code (only
# `import mlflow` is) - confirm that an earlier cell defines it.
import mlflow
res_sub = projects.run(
    "https://github.com/dmatrix/mlflow-workshop-project-expamle-1",
    parameters={
        'batch_size': 5,
        'epochs': 1000
    })
print(f"status={res_sub.get_status()}")
print(f"run_id={res_sub.run_id}")
Пример #12
0
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# datetime:2019/1/11 13:12
import os
import mlflow
from mlflow import projects
if __name__ == "__main__":
    # Two spellings of the example project's location; only the absolute
    # one (`path`) is passed to the launch below.
    relative_path = "../../../github//mlflow//examples//sklearn_elasticnet_wine"
    path = "e:\\github\\mlflow\\examples\\sklearn_elasticnet_wine"

    # Run the project without conda under experiment "2", with a single
    # `alpha` parameter.
    projects.run(
        uri=path,
        experiment_id="2",
        use_conda=False,
        parameters={"alpha": 0.5},
    )