Example #1
def _run_local(uri, entry_point, version, parameters, experiment_id, use_conda, use_temp_cwd,
               storage_dir, git_username, git_password):
    """
    Run an MLflow project from the given URI in a new directory.

    Supports downloading projects from Git URIs with a specified version, or copying them from
    the file system. For Git-based projects, a commit can be specified as the `version`.
    """
    eprint("=== Fetching project from %s ===" % uri)

    # Get the working directory to use for running the project & download it there
    work_dir = _get_work_dir(uri, use_temp_cwd)
    eprint("=== Work directory for this run: %s ===" % work_dir)
    expanded_uri = _expand_uri(uri)
    _fetch_project(expanded_uri, version, work_dir, git_username, git_password)

    # Load the MLproject file
    if not os.path.isfile(os.path.join(work_dir, "MLproject")):
        raise ExecutionException("No MLproject file found in %s" % uri)
    project = Project(expanded_uri, file_utils.read_yaml(work_dir, "MLproject"))
    _run_project(project, entry_point, work_dir, parameters, use_conda, storage_dir, experiment_id)
Example #2
def _create_databricks_run(tracking_uri, experiment_id, source_name,
                           source_version, entry_point_name):
    """
    Makes an API request to the specified tracking server to create a new run with the specified
    attributes. Returns an `ActiveRun` that can be used to query the tracking server for the run's
    status or log metrics/params for the run.
    """
    if tracking.is_local_uri(tracking_uri):
        # TODO: we'll actually use the Databricks deployment's tracking URI here in the future
        eprint(
            "WARNING: MLflow tracking URI is set to a local URI (%s), so results from "
            "Databricks will not be logged permanently." % tracking_uri)
        return None
    else:
        # Assume non-local tracking URIs are accessible from Databricks (won't work for e.g.
        # localhost)
        return tracking._create_run(experiment_id=experiment_id,
                                    source_name=source_name,
                                    source_version=source_version,
                                    entry_point_name=entry_point_name,
                                    source_type=SourceType.PROJECT)
Example #3
File: __init__.py  Project: vthily/mlflow
def _wait_for(submitted_run_obj):
    """Waits on the passed-in submitted run, reporting its status to the tracking server."""
    run_id = submitted_run_obj.run_id
    active_run = None
    # Note: there's a small chance we fail to report the run's status to the tracking server if
    # we're interrupted before we reach the try block below
    try:
        active_run = tracking._get_existing_run(
            run_id) if run_id is not None else None
        if submitted_run_obj.wait():
            eprint("=== Run (ID '%s') succeeded ===" % run_id)
            _maybe_set_run_terminated(active_run, "FINISHED")
        else:
            _maybe_set_run_terminated(active_run, "FAILED")
            raise ExecutionException("=== Run (ID '%s') failed ===" % run_id)
    except KeyboardInterrupt:
        eprint("=== Run (ID '%s') === interrupted, cancelling run ===" %
               run_id)
        submitted_run_obj.cancel()
        _maybe_set_run_terminated(active_run, "FAILED")
        raise
Example #4
def _get_entry_point_command(project, entry_point, parameters, conda_env_name, storage_dir):
    """
    Returns the shell command to execute in order to run the specified entry point.
    :param project: Project containing the target entry point
    :param entry_point: Entry point to run
    :param parameters: Parameters (dictionary) for the entry point command
    :param conda_env_name: Name of conda environment to use for command execution, or None if no
                           conda environment should be used.
    :param storage_dir: Base local directory to use for downloading remote artifacts passed to
                        arguments of type 'path'. If None, a temporary base directory is used.
    """
    storage_dir_for_run = _get_storage_dir(storage_dir)
    eprint("=== Created directory %s for downloading remote URIs passed to arguments of "
           "type 'path' ===" % storage_dir_for_run)
    commands = []
    if conda_env_name:
        activate_path = _get_conda_bin_executable("activate")
        commands.append("source %s %s" % (activate_path, conda_env_name))
    commands.append(
        project.get_entry_point(entry_point).compute_command(parameters, storage_dir_for_run))
    return " && ".join(commands)
Example #5
def _already_ran(entry_point_name, parameters, git_commit, experiment_id=None):
    experiment_id = experiment_id if experiment_id is not None else _get_experiment_id()
    client = mlflow.tracking.MlflowClient()
    all_run_infos = reversed(client.list_run_infos(experiment_id))
    for run_info in all_run_infos:
        full_run = client.get_run(run_info.run_id)
        tags = full_run.data.tags
        if tags.get(mlflow_tags.MLFLOW_PROJECT_ENTRY_POINT, None) != entry_point_name:
            continue
        match_failed = False
        for param_key, param_value in six.iteritems(parameters):
            run_value = full_run.data.params.get(param_key)
            if run_value != param_value:
                match_failed = True
                break
        if match_failed:
            continue

        if run_info.to_proto().status != RunStatus.FINISHED:
            eprint(("Run matched, but is not FINISHED, so skipping "
                    "(run_id=%s, status=%s)") % (run_info.run_id, run_info.status))
            continue

        previous_version = tags.get(mlflow_tags.MLFLOW_GIT_COMMIT, None)
        if git_commit != previous_version:
            eprint(("Run matched, but has a different source version, so skipping "
                    "(found=%s, expected=%s)") % (previous_version, git_commit))
            continue
        return client.get_run(run_info.run_id)
    eprint("No matching run has been found.")
    return None
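
A hedged usage sketch of the run-caching pattern this helper enables, reusing a prior successful run instead of recomputing it; the entry point name, parameter values, and commit hash are hypothetical:

# Hypothetical values throughout; assumes _already_ran from the example above.
existing_run = _already_ran("etl_data", {"ratio": "0.5"}, git_commit="abc1234")
if existing_run is not None:
    print("Reusing cached run %s" % existing_run.info.run_id)
else:
    print("No matching FINISHED run found; a new run would be launched.")
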
Example #6
def load_model(path, run_id=None, dfs_tmpdir=None):
    """
    Load the Spark MLlib model from the path.

    :param path: Local filesystem path or run-relative artifact path to the model.
    :param run_id: Run ID. If provided, combined with ``path`` to identify the model.
    :param dfs_tmpdir: Temporary directory path on the Distributed File System (DFS) used when
                       loading the model; defaults to ``DFS_TMP``.
    :return: SparkML model.
    :rtype: pyspark.ml.pipeline.PipelineModel

    >>> import mlflow.spark
    >>> model = mlflow.spark.load_model("spark-model")
    >>> # Prepare test documents, which are unlabeled (id, text) tuples, using an
    >>> # active SparkSession named `spark` (as in the pyspark shell).
    >>> test = spark.createDataFrame([
    ...   (4, "spark i j k"),
    ...   (5, "l m n"),
    ...   (6, "spark hadoop spark"),
    ...   (7, "apache hadoop")], ["id", "text"])
    >>> # Make predictions on test documents.
    >>> prediction = model.transform(test)

    """
    dfs_tmpdir = dfs_tmpdir if dfs_tmpdir is not None else DFS_TMP
    if run_id is not None:
        path = mlflow.tracking.utils._get_model_log_dir(model_name=path,
                                                        run_id=run_id)
    m = Model.load(os.path.join(path, 'MLmodel'))
    if FLAVOR_NAME not in m.flavors:
        raise Exception("Model does not have {} flavor".format(FLAVOR_NAME))
    conf = m.flavors[FLAVOR_NAME]
    model_path = os.path.join(path, conf['model_data'])
    tmp_path = _tmp_path(dfs_tmpdir)
    # Spark ML expects the model to be stored on DFS
    # Copy the model to a temp DFS location first. We cannot delete this file, as
    # Spark may read from it at any point.
    _HadoopFileSystem.copy_from_local_file(model_path,
                                           tmp_path,
                                           removeSrc=False)
    pipeline_model = PipelineModel.load(tmp_path)
    eprint("Copied SparkML model to %s" % tmp_path)
    return pipeline_model
Example #7
def run_databricks(uri, entry_point, version, parameters, experiment_id,
                   cluster_spec, git_username, git_password):
    """
    Runs the project at the specified URI on Databricks, returning a `SubmittedRun` that can be
    used to query the run's status or wait for the resulting Databricks Job run to terminate.
    """
    _check_databricks_auth_available()
    if cluster_spec is None:
        raise ExecutionException(
            "Cluster spec must be provided when launching MLflow project runs "
            "on Databricks.")

    # Fetch the project into work_dir & validate parameters
    work_dir = _fetch_project(uri=uri,
                              use_temp_cwd=True,
                              version=version,
                              git_username=git_username,
                              git_password=git_password)
    project = _load_project(work_dir)
    project.get_entry_point(entry_point)._validate_parameters(parameters)
    # Upload the project to DBFS, get the URI of the project
    dbfs_project_uri = _upload_project_to_dbfs(work_dir, experiment_id)

    # Create run object with remote tracking server. Get the git commit from the working directory,
    # etc.
    tracking_uri = tracking.get_tracking_uri()
    remote_run = _create_databricks_run(
        tracking_uri=tracking_uri,
        experiment_id=experiment_id,
        source_name=_expand_uri(uri),
        source_version=tracking._get_git_commit(work_dir),
        entry_point_name=entry_point)
    # Set up environment variables for remote execution
    env_vars = {}
    if experiment_id is not None:
        eprint("=== Using experiment ID %s ===" % experiment_id)
        env_vars[tracking._EXPERIMENT_ID_ENV_VAR] = experiment_id
    if remote_run is not None:
        env_vars[tracking._TRACKING_URI_ENV_VAR] = tracking.get_tracking_uri()
        env_vars[tracking._RUN_ID_ENV_VAR] = remote_run.run_info.run_uuid
    eprint("=== Running entry point %s of project %s on Databricks. ===" %
           (entry_point, uri))
    # Launch run on Databricks
    with open(cluster_spec, 'r') as handle:
        try:
            cluster_spec = json.load(handle)
        except ValueError:
            eprint(
                "Error when attempting to load and parse JSON cluster spec from file "
                "%s. " % cluster_spec)
            raise
    fuse_dst_dir = os.path.join(
        "/dbfs/",
        _parse_dbfs_uri_path(dbfs_project_uri).lstrip("/"))
    run_id = remote_run.run_info.run_uuid if remote_run else None
    command = _get_databricks_run_cmd(fuse_dst_dir, run_id, entry_point,
                                      parameters)
    db_run_id = _run_shell_command_job(uri, command, env_vars, cluster_spec)
    return DatabricksSubmittedRun(db_run_id, run_id)
Example #8
def _create_sagemaker_endpoint(endpoint_name, image_url, model_s3_path, run_id, instance_type,
                               instance_count, role, sage_client):
    """
    :param image_url: URL of the ECR-hosted docker image the model is being deployed into.
    :param model_s3_path: S3 path where we stored the model artifacts.
    :param run_id: Run ID that generated this model.
    :param instance_type: The type of SageMaker ML instance on which to deploy the model.
    :param instance_count: The number of SageMaker ML instances on which to deploy the model.
    :param role: SageMaker execution ARN role
    :param sage_client: A boto3 client for SageMaker
    """
    eprint("Creating new endpoint with name: {en} ...".format(
        en=endpoint_name))

    model_name = _get_sagemaker_model_name(endpoint_name)
    model_response = _create_sagemaker_model(model_name=model_name,
                                             model_s3_path=model_s3_path,
                                             run_id=run_id,
                                             image_url=image_url,
                                             execution_role=role,
                                             sage_client=sage_client)
    eprint("Created model with arn: %s" % model_response["ModelArn"])

    production_variant = {
        'VariantName': model_name,
        'ModelName': model_name,
        'InitialInstanceCount': instance_count,
        'InstanceType': instance_type,
        'InitialVariantWeight': 1,
    }
    config_name = _get_sagemaker_config_name(endpoint_name)
    endpoint_config_response = sage_client.create_endpoint_config(
        EndpointConfigName=config_name,
        ProductionVariants=[production_variant],
        Tags=[
            {
                'Key': 'app_name',
                'Value': endpoint_name,
            },
        ],
    )
    eprint("Created endpoint configuration with arn: %s"
           % endpoint_config_response["EndpointConfigArn"])

    endpoint_response = sage_client.create_endpoint(
        EndpointName=endpoint_name,
        EndpointConfigName=config_name,
        Tags=[],
    )
    eprint("Created endpoint with arn: %s" % endpoint_response["EndpointArn"])
Example #9
def delete(app_name, region_name="us-west-2", archive=False):
    """
    Delete the specified application.

    :param app_name: Name of the deployed application.
    :param region_name: Name of the AWS region in which the application is deployed.
    :param archive: If True, resources associated with the specified application, such
                    as its associated models and endpoint configuration, will be preserved.
                    If False, these resources will be deleted.
    """
    s3_client = boto3.client('s3', region_name=region_name)
    sage_client = boto3.client('sagemaker', region_name=region_name)

    endpoint_info = sage_client.describe_endpoint(EndpointName=app_name)
    endpoint_arn = endpoint_info["EndpointArn"]

    sage_client.delete_endpoint(EndpointName=app_name)
    eprint("Deleted endpoint with arn: {earn}".format(earn=endpoint_arn))

    if not archive:
        config_name = endpoint_info["EndpointConfigName"]
        config_info = sage_client.describe_endpoint_config(
            EndpointConfigName=config_name)
        config_arn = config_info["EndpointConfigArn"]
        sage_client.delete_endpoint_config(EndpointConfigName=config_name)
        eprint("Deleted associated endpoint configuration with arn: {carn}".
               format(carn=config_arn))
        for pv in config_info["ProductionVariants"]:
            model_name = pv["ModelName"]
            model_arn = _delete_sagemaker_model(model_name, sage_client,
                                                s3_client)
            eprint("Deleted associated model with arn: {marn}".format(
                marn=model_arn))
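
A hedged usage sketch; the application name below is a hypothetical placeholder:

# Tear down the endpoint but preserve its models and endpoint configuration.
# "my-sagemaker-app" is a hypothetical application name.
delete("my-sagemaker-app", region_name="us-west-2", archive=True)
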
Example #10
def build_image(name=DEFAULT_IMAGE_NAME, mlflow_home=None):
    """
    This function builds an MLflow Docker image.
    The image is built locally and it requires Docker to run.

    :param name: image name
    """
    with TempDir() as tmp:
        install_mlflow = "RUN pip install mlflow=={version}".format(
            version=mlflow.version.VERSION)
        cwd = tmp.path()
        if mlflow_home:
            mlflow_dir = _copy_project(src_path=mlflow_home,
                                       dst_path=tmp.path())
            install_mlflow = ("COPY {mlflow_dir} /opt/mlflow\n"
                              "RUN cd /opt/mlflow/mlflow/java/scoring &&"
                              " mvn --batch-mode package -DskipTests \n"
                              "RUN pip install /opt/mlflow\n")
            install_mlflow = install_mlflow.format(mlflow_dir=mlflow_dir)
        else:
            eprint("`mlflow_home` was not specified. The image will install"
                   " MLflow from pip instead. As a result, the container will"
                   " not support the MLeap flavor.")

        with open(os.path.join(cwd, "Dockerfile"), "w") as f:
            f.write(_DOCKERFILE_TEMPLATE % install_mlflow)
        eprint("building docker image")
        os.system('find {cwd}/'.format(cwd=cwd))
        proc = Popen(["docker", "build", "-t", name, "-f", "Dockerfile", "."],
                     cwd=cwd,
                     stdout=PIPE,
                     stderr=STDOUT,
                     universal_newlines=True)
        for x in iter(proc.stdout.readline, ""):
            eprint(x, end='')
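
A hedged usage sketch; the repository path is a hypothetical placeholder:

# Build from a local MLflow checkout so the image supports the MLeap flavor.
build_image(name="mlflow-pyfunc", mlflow_home="/path/to/mlflow")  # hypothetical path
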
Example #11
def _run_project(project, entry_point, work_dir, parameters, use_conda,
                 storage_dir, experiment_id):
    """Locally run a project that has been checked out in `work_dir`."""
    storage_dir_for_run = _get_storage_dir(storage_dir)
    eprint(
        "=== Created directory %s for downloading remote URIs passed to arguments of "
        "type 'path' ===" % storage_dir_for_run)
    # Try to build the command first in case the user mis-specified parameters
    run_project_command = project.get_entry_point(entry_point)\
        .compute_command(parameters, storage_dir_for_run)
    commands = []
    if use_conda:
        conda_env_path = os.path.abspath(
            os.path.join(work_dir, project.conda_env))
        _maybe_create_conda_env(conda_env_path)
        commands.append("source activate %s" %
                        _get_conda_env_name(conda_env_path))

    # Create a new run and log every provided parameter into it.
    active_run = tracking.start_run(
        experiment_id=experiment_id,
        source_name=project.uri,
        source_version=tracking._get_git_commit(work_dir),
        entry_point_name=entry_point,
        source_type=SourceType.PROJECT)
    if parameters is not None:
        for key, value in parameters.items():
            active_run.log_param(Param(key, value))
    # Add the run id into a magic environment variable that the subprocess will read,
    # causing it to reuse the run.
    exp_id = experiment_id or tracking._get_experiment_id()
    env_map = {
        tracking._RUN_ID_ENV_VAR: active_run.run_info.run_uuid,
        tracking._TRACKING_URI_ENV_VAR: tracking.get_tracking_uri(),
        tracking._EXPERIMENT_ID_ENV_VAR: str(exp_id),
    }

    commands.append(run_project_command)
    command = " && ".join(commands)
    eprint("=== Running command: %s ===" % command)
    try:
        process.exec_cmd([os.environ.get("SHELL", "bash"), "-c", command],
                         cwd=work_dir,
                         stream_output=True,
                         env=env_map)
        tracking.end_run()
        eprint("=== Run succeeded ===")
    except process.ShellCommandException:
        tracking.end_run("FAILED")
        eprint("=== Run failed ===")
Example #12
def _already_ran(entry_point_name,
                 parameters,
                 source_version,
                 experiment_id=None):
    """Best-effort detection of if a run with the given entrypoint name,
    parameters, and experiment id already ran. The run must have completed
    successfully and have at least the parameters provided.
    """
    experiment_id = experiment_id if experiment_id is not None else _get_experiment_id()
    client = mlflow.tracking.MlflowClient()
    all_run_infos = reversed(client.list_run_infos(experiment_id))
    for run_info in all_run_infos:
        if run_info.entry_point_name != entry_point_name:
            continue

        full_run = client.get_run(run_info.run_uuid)
        run_params = _get_params(full_run)
        match_failed = False
        for param_key, param_value in six.iteritems(parameters):
            run_value = run_params.get(param_key)
            if run_value != param_value:
                match_failed = True
                break
        if match_failed:
            continue

        if run_info.status != RunStatus.FINISHED:
            eprint(("Run matched, but is not FINISHED, so skipping "
                    "(run_id=%s, status=%s)") %
                   (run_info.run_uuid, run_info.status))
            continue
        if run_info.source_version != source_version:
            eprint((
                "Run matched, but has a different source version, so skipping "
                "(found=%s, expected=%s)") %
                   (run_info.source_version, source_version))
            continue
        return client.get_run(run_info.run_uuid)
    return None
Example #13
def run_local(model_path, run_id=None, port=5000, image=DEFAULT_IMAGE_NAME):
    """
    Serve model locally in a SageMaker compatible Docker container.
    :param model_path:  Path to the model.
    Either local if no run_id or MLflow-relative if run_id is specified)
    :param run_id: MLflow RUN-ID.
    :param port: local port
    :param image: name of the Docker image to be used.
    """
    if run_id:
        model_path = _get_model_log_dir(model_path, run_id)
    _check_compatible(model_path)
    model_path = os.path.abspath(model_path)
    eprint("launching docker image with path {}".format(model_path))
    cmd = ["docker", "run", "-v", "{}:/opt/ml/model/".format(model_path), "-p", "%d:8080" % port,
           "--rm", image, "serve"]
    eprint('executing', ' '.join(cmd))
    proc = Popen(cmd, stdout=PIPE, stderr=STDOUT, universal_newlines=True)

    import signal

    def _sigterm_handler(*_):
        eprint("received termination signal => killing docker process")
        proc.send_signal(signal.SIGINT)

    signal.signal(signal.SIGTERM, _sigterm_handler)
    for x in iter(proc.stdout.readline, ""):
        eprint(x, end='')
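
A hedged usage sketch; the artifact path and run ID are hypothetical placeholders:

# Serve the run's "model" artifact on local port 5001 instead of the default 5000.
run_local(model_path="model", run_id="0123456789abcdef", port=5001)  # hypothetical IDs
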
Example #14
def _get_default_s3_bucket(region_name):
    # create bucket if it does not exist
    sess = boto3.Session()
    account_id = _get_account_id()
    bucket_name = "{pfx}-{rn}-{aid}".format(pfx=DEFAULT_BUCKET_NAME_PREFIX,
                                            rn=region_name,
                                            aid=account_id)
    s3 = sess.client('s3')
    response = s3.list_buckets()
    buckets = [b['Name'] for b in response["Buckets"]]
    if bucket_name not in buckets:
        eprint("Default bucket `%s` not found. Creating..." % bucket_name)
        bucket_creation_kwargs = {
            'ACL': 'bucket-owner-full-control',
            'Bucket': bucket_name,
        }
        if region_name != "us-east-1":
            # The location constraint is required during bucket creation for all regions
            # outside of us-east-1. This constraint cannot be specified in us-east-1;
            # specifying it in this region results in a failure, so we will only
            # add it if we are deploying outside of us-east-1.
            # See https://docs.aws.amazon.com/cli/latest/reference/s3api/create-bucket.html#examples
            bucket_creation_kwargs['CreateBucketConfiguration'] = {
                'LocationConstraint': region_name
            }
        response = s3.create_bucket(**bucket_creation_kwargs)
        eprint(response)
    else:
        eprint("Default bucket `%s` already exists. Skipping creation." %
               bucket_name)
    return bucket_name
Example #15
File: fluent.py  Project: zied2/mlflow
def register_model(model_uri, name):
    """
    Create a new model version in model registry for the model files specified by ``model_uri``.
    Note that this method assumes the model registry backend URI is the same as that of the
    tracking backend.
    :param model_uri: URI referring to the MLmodel directory. Use a ``runs:/`` URI if you want to
                      record the run ID with the model in model registry. ``models:/`` URIs are
                      currently not supported.
    :param name: Name of the registered model under which to create a new model version. If a
                 registered model with the given name does not exist, it will be created
                 automatically.
    :return: Single :py:class:`mlflow.entities.model_registry.ModelVersion` object created by
             backend.
    """
    client = MlflowClient()
    try:
        create_model_response = client.create_registered_model(name)
        eprint("Successfully registered model '%s'." % create_model_response.name)
    except MlflowException as e:
        if e.error_code == ErrorCode.Name(RESOURCE_ALREADY_EXISTS):
            eprint("Registered model '%s' already exists. Creating a new version of this model..."
                   % name)
        else:
            raise e

    if RunsArtifactRepository.is_runs_uri(model_uri):
        source = RunsArtifactRepository.get_underlying_uri(model_uri)
        (run_id, _) = RunsArtifactRepository.parse_runs_uri(model_uri)
        create_version_response = client.create_model_version(name, source, run_id)
    else:
        create_version_response = client.create_model_version(name, source=model_uri, run_id=None)
    eprint("Created version '{version}' of model '{model_name}'.".format(
        version=create_version_response.version, model_name=create_version_response.get_name()))
    return create_version_response
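
A hedged usage sketch; the run ID and model name are hypothetical placeholders:

# A runs:/ URI records the originating run ID with the new model version.
mv = register_model("runs:/0123456789abcdef/model", "MyRegisteredModel")  # hypothetical IDs
print(mv.version)  # version number assigned by the registry backend
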
Example #16
def ui(backend_store_uri, default_artifact_root, port, host):
    """
    Launch the MLflow tracking UI for local viewing of run results. To launch a production
    server, use the "mlflow server" command instead.

    The UI is served at http://localhost:5000 by default and only accepts connections from the
    local machine. To let the UI server accept connections from other machines, you will need to
    pass ``--host 0.0.0.0`` to listen on all network interfaces (or a specific interface
    address).
    """
    from mlflow.server import _run_server
    from mlflow.server.handlers import initialize_backend_stores

    # Ensure that both backend_store_uri and default_artifact_root are set correctly.
    if not backend_store_uri:
        backend_store_uri = DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH

    if not default_artifact_root:
        if is_local_uri(backend_store_uri):
            default_artifact_root = backend_store_uri
        else:
            default_artifact_root = DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH

    try:
        initialize_backend_stores(backend_store_uri, default_artifact_root)
    except Exception as e:  # pylint: disable=broad-except
        _logger.error("Error initializing backend store")
        _logger.exception(e)
        sys.exit(1)

    # TODO: We eventually want to disable the write path in this version of the server.
    try:
        _run_server(backend_store_uri, default_artifact_root, host, port, None,
                    1)
    except ShellCommandException:
        eprint(
            "Running the mlflow server failed. Please see the logs above for details."
        )
        sys.exit(1)
Example #17
def http_request(hostname,
                 endpoint,
                 method,
                 headers=None,
                 req_body_json=None,
                 params=None,
                 secure_verify=True,
                 retries=3,
                 retry_interval=3):
    """
    Makes an HTTP request with the specified method to the specified hostname/endpoint. Retries
    up to `retries` times if a request fails with a server error (e.g. error code 500), waiting
    `retry_interval` seconds between successive retries. Parses the API response (assumed to be
    JSON) into a Python object and returns it.

    :param headers: Request headers to use when making the HTTP request
    :param req_body_json: Dictionary containing the request body
    :param params: Query parameters for the request
    :return: Parsed API response
    """
    url = "%s%s" % (hostname, endpoint)
    for i in range(retries):
        response = requests.request(method=method,
                                    url=url,
                                    headers=headers,
                                    verify=secure_verify,
                                    params=params,
                                    json=req_body_json)
        if response.status_code >= 200 and response.status_code < 500:
            return json.loads(response.text)
        else:
            eprint(
                "API request to %s failed with code %s != 200, retrying up to %s more times. "
                "API response body: %s" %
                (url, response.status_code, retries - i - 1, response.text))
            time.sleep(retry_interval)
    raise Exception(
        "API request to %s failed to return code 200 after %s tries" %
        (url, retries))
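
A hedged usage sketch; the hostname is a placeholder and the endpoint path is an assumption rather than something this code defines:

# GET a hypothetical REST endpoint; returns the parsed JSON body on success.
experiments = http_request(hostname="http://localhost:5000",
                           endpoint="/api/2.0/mlflow/experiments/list",  # assumed path
                           method="GET")
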
Example #18
def _deploy(role, image, app_name, model_s3_path, run_id, region_name):
    """
    Deploy model on sagemaker.
    :param role: SageMaker execution ARN role
    :param image: Name of the Docker image the model is being deployed into
    :param app_name: Name of the deployed app
    :param model_s3_path: s3 path where we stored the model artifacts
    :param run_id: RunId that generated this model
    """
    sage_client = boto3.client('sagemaker', region_name=region_name)
    ecr_client = boto3.client("ecr")
    repository_conf = ecr_client.describe_repositories(
        repositoryNames=[image])['repositories'][0]
    model_name = app_name + '-model'
    model_response = sage_client.create_model(
        ModelName=model_name,
        PrimaryContainer={
            'ContainerHostname': 'mlflow-serve-%s' % model_name,
            'Image': repository_conf["repositoryUri"],
            'ModelDataUrl': model_s3_path,
            'Environment': {},
        },
        ExecutionRoleArn=role,
        Tags=[
            {
                'Key': 'run_id',
                'Value': str(run_id)
            },
        ],
    )
    eprint("model_arn: %s" % model_response["ModelArn"])
    config_name = app_name + "-config"
    endpoint_config_response = sage_client.create_endpoint_config(
        EndpointConfigName=config_name,
        ProductionVariants=[
            {
                'VariantName': 'model1',
                'ModelName':
                model_name,  # is this the unique identifier for Model?
                'InitialInstanceCount': 1,
                'InstanceType': 'ml.m4.xlarge',
                'InitialVariantWeight': 1,
            },
        ],
        Tags=[
            {
                'Key': 'app_name',
                'Value': app_name,
            },
        ],
    )
    eprint("endpoint_config_arn: %s" %
           endpoint_config_response["EndpointConfigArn"])
    endpoint_response = sage_client.create_endpoint(
        EndpointName=app_name,
        EndpointConfigName=config_name,
        Tags=[],
    )
    eprint("endpoint_arn: %s" % endpoint_response["EndpointArn"])
Example #19
def _fetch_project(uri,
                   force_tempdir,
                   version=None,
                   git_username=None,
                   git_password=None):
    """
    Fetch a project into a local directory, returning the path to the local project directory.
    :param force_tempdir: If True, will fetch the project into a temporary directory. Otherwise,
                          will fetch Git projects into a temporary directory but simply return the
                          path of local projects (i.e. perform a no-op for local projects).
    """
    parsed_uri, subdirectory = _parse_subdirectory(uri)
    use_temp_dst_dir = force_tempdir or not _is_local_path(uri)
    dst_dir = tempfile.mkdtemp() if use_temp_dst_dir else urllib.parse.urlparse(uri).path
    if use_temp_dst_dir:
        eprint("=== Fetching project from %s into %s ===" % (uri, dst_dir))
    # Download a project to the target `dst_dir` from a Git URI or local path.
    if _GIT_URI_REGEX.match(uri):
        # Use Git to clone the project
        _fetch_git_repo(parsed_uri, version, dst_dir, git_username,
                        git_password)
    else:
        if version is not None:
            raise ExecutionException(
                "Setting a version is only supported for Git project URIs")
        if use_temp_dst_dir:
            dir_util.copy_tree(src=parsed_uri, dst=dst_dir)

    # Make sure there is an MLproject file in the specified working directory.
    if not os.path.isfile(os.path.join(dst_dir, subdirectory, "MLproject")):
        if subdirectory == '':
            raise ExecutionException("No MLproject file found in %s" % uri)
        else:
            raise ExecutionException(
                "No MLproject file found in subdirectory %s of %s" %
                (subdirectory, parsed_uri))

    return os.path.abspath(os.path.join(dst_dir, subdirectory))
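
Since _parse_subdirectory is applied to the URI, a project may live in a subdirectory of a repository. A hedged usage sketch with a hypothetical repository, assuming the '#' subdirectory separator convention used by MLflow project URIs:

# Clone the repo into a temp dir and return the absolute path of the subproject.
work_dir = _fetch_project("https://github.com/example/ml-repo#examples/sklearn",  # hypothetical
                          force_tempdir=False)
print(work_dir)
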
Example #20
File: cli.py  Project: vivlet/mlflow
def ui(backend_store_uri, default_artifact_root, host, port, gunicorn_opts):
    """
    Launch the MLflow tracking UI.

    The UI will be visible at http://localhost:5000 by default.
    """
    # Ensure that both backend_store_uri and default_artifact_root are set correctly.
    if not backend_store_uri:
        backend_store_uri = DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH

    if not default_artifact_root:
        if _is_local_uri(backend_store_uri):
            default_artifact_root = backend_store_uri
        else:
            default_artifact_root = DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH

    # TODO: We eventually want to disable the write path in this version of the server.
    try:
        _run_server(backend_store_uri, default_artifact_root, host, port, 1, None, gunicorn_opts)
    except ShellCommandException:
        eprint("Running the mlflow server failed. Please see the logs above for details.")
        sys.exit(1)
Example #21
def _build_image(image_name,
                 entrypoint,
                 mlflow_home=None,
                 custom_setup_steps_hook=None):
    """
    Build an MLflow Docker image that can be used to serve a
    The image is built locally and it requires Docker to run.

    :param image_name: Docker image name.
    :param entry_point: String containing ENTRYPOINT directive for docker image
    :param mlflow_home: (Optional) Path to a local copy of the MLflow GitHub repository.
                        If specified, the image will install MLflow from this directory.
                        If None, it will install MLflow from pip.
    :param custom_setup_steps_hook: (Optional) Single-argument function that takes the string path
           of a dockerfile context directory and returns a string containing Dockerfile commands to
           run during the image build step.
    """
    mlflow_home = os.path.abspath(mlflow_home) if mlflow_home else None
    with TempDir() as tmp:
        cwd = tmp.path()
        install_mlflow = _get_mlflow_install_step(cwd, mlflow_home)
        custom_setup_steps = custom_setup_steps_hook(
            cwd) if custom_setup_steps_hook else ""
        with open(os.path.join(cwd, "Dockerfile"), "w") as f:
            f.write(
                _DOCKERFILE_TEMPLATE.format(
                    install_mlflow=install_mlflow,
                    custom_setup_steps=custom_setup_steps,
                    entrypoint=entrypoint))
        _logger.info("Building docker image with name %s", image_name)
        os.system('find {cwd}/'.format(cwd=cwd))
        proc = Popen(
            ["docker", "build", "-t", image_name, "-f", "Dockerfile", "."],
            cwd=cwd,
            stdout=PIPE,
            stderr=STDOUT,
            universal_newlines=True)
        for x in iter(proc.stdout.readline, ""):
            eprint(x, end='')
Example #22
def _user_args_to_dict(arguments, argument_type="P"):
    user_dict = {}
    for arg in arguments:
        split = arg.split("=", maxsplit=1)
        # Docker arguments such as `t` don't require a value -> set to True if specified
        if len(split) == 1 and argument_type == "A":
            name = split[0]
            value = True
        elif len(split) == 2:
            name = split[0]
            value = split[1]
        else:
            eprint(
                "Invalid format for -%s parameter: '%s'. "
                "Use -%s name=value." % (argument_type, arg, argument_type)
            )
            sys.exit(1)
        if name in user_dict:
            eprint("Repeated parameter: '%s'" % name)
            sys.exit(1)
        user_dict[name] = value
    return user_dict
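
A small sketch of the parsing behavior shown above: "=" splits each argument into name and value exactly once, and bare flags are accepted only for Docker-style "-A" arguments:

print(_user_args_to_dict(["alpha=0.5", "steps=10"]))
# {'alpha': '0.5', 'steps': '10'}
print(_user_args_to_dict(["t", "rm"], argument_type="A"))
# {'t': True, 'rm': True}
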
Example #23
def patch_impl(original):
    eprint("patch1")
    logger.info("patch2")
    warnings.warn_explicit("preamble MLflow warning",
                           category=Warning,
                           filename=mlflow.__file__,
                           lineno=5)
    warnings.warn_explicit("preamble numpy warning",
                           category=UserWarning,
                           filename=np.__file__,
                           lineno=7)
    original()
    warnings.warn_explicit("postamble MLflow warning",
                           category=Warning,
                           filename=mlflow.__file__,
                           lineno=10)
    warnings.warn_explicit("postamble numpy warning",
                           category=Warning,
                           filename=np.__file__,
                           lineno=14)
    logger.warning("patch3")
    logger.critical("patch4")
Example #24
def _run_shell_command_job(project_uri, command, env_vars, cluster_spec):
    """
    Runs the specified shell command on a Databricks cluster.
    :param project_uri: URI of the project from which our shell command originates
    :param command: Shell command to run
    :param env_vars: Environment variables to set in the process running `command`
    :param cluster_spec: Dictionary describing the cluster, expected to contain the fields for a
                         NewCluster (see
                         https://docs.databricks.com/api/latest/jobs.html#jobsclusterspecnewcluster)
    :return: The ID of the Databricks Job Run. Can be used to query the run's status via the
             Databricks Runs Get API (https://docs.databricks.com/api/latest/jobs.html#runs-get).
    """
    # Make jobs API request to launch run.
    req_body_json = {
        'run_name': 'MLflow Run for %s' % project_uri,
        'new_cluster': cluster_spec,
        'shell_command_task': {
            'command': command,
            "env_vars": env_vars
        },
        # NB: We use <= on the version specifier to allow running projects on pre-release
        # versions, where we will select the most up-to-date mlflow version available.
        # Also note that we escape this so '<' is not treated as a shell redirect.
        "libraries": [{
            "pypi": {
                "package": "'mlflow<=%s'" % VERSION
            }
        }]
    }
    run_submit_res = _jobs_runs_submit(req_body_json)
    databricks_run_id = run_submit_res["run_id"]
    eprint(
        "=== Launched MLflow run as Databricks job run with ID %s. Getting run status "
        "page URL... ===" % databricks_run_id)
    run_info = _jobs_runs_get(databricks_run_id)
    jobs_page_url = run_info["run_page_url"]
    eprint("=== Check the run's status at %s ===" % jobs_page_url)
    return databricks_run_id
Example #25
def push_image_to_ecr(image=DEFAULT_IMAGE_NAME):
    """
    Push local Docker image to AWS ECR.

    The image is pushed under currently active AWS account and to the currently active AWS region.

    :param image: Docker image name.
    """
    eprint("Pushing image to ECR")
    client = boto3.client("sts")
    caller_id = client.get_caller_identity()
    account = caller_id['Account']
    my_session = boto3.session.Session()
    region = my_session.region_name or "us-west-2"
    fullname = _full_template.format(account=account,
                                     region=region,
                                     image=image,
                                     version=mlflow.version.VERSION)
    eprint("Pushing docker image {image} to {repo}".format(image=image,
                                                           repo=fullname))
    ecr_client = boto3.client('ecr')
    try:
        ecr_client.describe_repositories(
            repositoryNames=[image])['repositories']
    except ecr_client.exceptions.RepositoryNotFoundException:
        ecr_client.create_repository(repositoryName=image)
        print("Created new ECR repository: {repository_name}".format(
            repository_name=image))
    # TODO: it would be nice to translate the docker login, tag and push to python api.
    # x = ecr_client.get_authorization_token()['authorizationData'][0]
    # docker_login_cmd = "docker login -u AWS -p {token} {url}".format(token=x['authorizationToken']
    #                                                                ,url=x['proxyEndpoint'])
    docker_login_cmd = "$(aws ecr get-login --no-include-email)"
    docker_tag_cmd = "docker tag {image} {fullname}".format(image=image,
                                                            fullname=fullname)
    docker_push_cmd = "docker push {}".format(fullname)
    cmd = ";\n".join([docker_login_cmd, docker_tag_cmd, docker_push_cmd])
    os.system(cmd)
Example #26
def http_request(host_creds, endpoint, retries=3, retry_interval=3, **kwargs):
    """
    Makes an HTTP request with the specified method to the specified hostname/endpoint. Retries
    up to `retries` times if a request fails with a server error (e.g. error code 500), waiting
    `retry_interval` seconds between successive retries. Parses the API response (assumed to be
    JSON) into a Python object and returns it.

    :param host_creds: A :py:class:`mlflow.rest_utils.MlflowHostCreds` object containing
        hostname and optional authentication.
    :return: Parsed API response
    """
    hostname = host_creds.host
    auth_str = None
    if host_creds.username and host_creds.password:
        basic_auth_str = ("%s:%s" % (host_creds.username, host_creds.password)).encode("utf-8")
        auth_str = "Basic " + base64.standard_b64encode(basic_auth_str).decode("utf-8")
    elif host_creds.token:
        auth_str = "Bearer %s" % host_creds.token

    headers = {}
    if auth_str:
        headers['Authorization'] = auth_str

    verify = not host_creds.ignore_tls_verification

    cleaned_hostname = strip_suffix(hostname, '/')
    url = "%s%s" % (cleaned_hostname, endpoint)
    for i in range(retries):
        response = requests.request(url=url, headers=headers, verify=verify, **kwargs)
        if response.status_code >= 200 and response.status_code < 500:
            return response
        else:
            eprint("API request to %s failed with code %s != 200, retrying up to %s more times. "
                   "API response body: %s" % (url, response.status_code, retries - i - 1,
                                              response.text))
            time.sleep(retry_interval)
    raise MlflowException("API request to %s failed to return code 200 after %s tries" %
                          (url, retries))
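
A hedged usage sketch; the host, token, and endpoint are hypothetical, and the MlflowHostCreds import path is an assumption that may vary across MLflow versions:

from mlflow.utils.rest_utils import MlflowHostCreds  # assumed import path

creds = MlflowHostCreds(host="https://mlflow.example.com", token="SECRET")  # hypothetical
response = http_request(creds, "/api/2.0/mlflow/experiments/list", method="GET")
print(response.status_code)
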
Example #27
def _run_project(project, entry_point, work_dir, parameters, use_conda, storage_dir,
                 experiment_id, block):
    """Locally run a project that has been checked out in `work_dir`."""
    storage_dir_for_run = _get_storage_dir(storage_dir)
    eprint("=== Created directory %s for downloading remote URIs passed to arguments of "
           "type 'path' ===" % storage_dir_for_run)
    # Try to build the command first in case the user mis-specified parameters
    run_project_command = project.get_entry_point(entry_point)\
        .compute_command(parameters, storage_dir_for_run)
    commands = []
    if use_conda:
        conda_env_path = os.path.abspath(os.path.join(work_dir, project.conda_env))
        _maybe_create_conda_env(conda_env_path)
        commands.append("source activate %s" % _get_conda_env_name(conda_env_path))

    # Create a new run and log every provided parameter into it.
    active_run = tracking._create_run(
        experiment_id=experiment_id, source_name=project.uri,
        source_version=tracking._get_git_commit(work_dir), entry_point_name=entry_point,
        source_type=SourceType.PROJECT)
    if parameters is not None:
        for key, value in parameters.items():
            active_run.log_param(Param(key, value))
    # Add the run id into a magic environment variable that the subprocess will read,
    # causing it to reuse the run.
    env_map = {
        tracking._RUN_ID_ENV_VAR: active_run.run_info.run_uuid,
        tracking._TRACKING_URI_ENV_VAR: tracking.get_tracking_uri(),
        tracking._EXPERIMENT_ID_ENV_VAR: str(experiment_id),
    }

    commands.append(run_project_command)
    command = " && ".join(commands)
    eprint("=== Running command '%s' in run with ID '%s' === "
           % (command, active_run.run_info.run_uuid))

    return _launch_local_run(
        active_run, command, work_dir, env_map, stream_output=block)
Example #28
def server(backend_store_uri, default_artifact_root, host, port,
           workers, static_prefix, gunicorn_opts, waitress_opts):
    """
    Run the MLflow tracking server.

    The server which listen on http://localhost:5000 by default, and only accept connections
    from the local machine. To let the server accept connections from other machines, you will need
    to pass ``--host 0.0.0.0`` to listen on all network interfaces
    (or a specific interface address).
    """

    _validate_server_args(gunicorn_opts=gunicorn_opts, workers=workers, waitress_opts=waitress_opts)

    # Ensure that both backend_store_uri and default_artifact_root are set correctly.
    if not backend_store_uri:
        backend_store_uri = DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH

    if not default_artifact_root:
        if _is_local_uri(backend_store_uri):
            default_artifact_root = backend_store_uri
        else:
            eprint("Option 'default-artifact-root' is required, when backend store is not "
                   "local file based.")
            sys.exit(1)

    try:
        _get_store(backend_store_uri, default_artifact_root)
    except Exception as e:  # pylint: disable=broad-except
        _logger.error("Error initializing backend store")
        _logger.exception(e)
        sys.exit(1)

    try:
        _run_server(backend_store_uri, default_artifact_root, host, port,
                    static_prefix, workers, gunicorn_opts, waitress_opts)
    except ShellCommandException:
        eprint("Running the mlflow server failed. Please see the logs above for details.")
        sys.exit(1)
Example #29
def run_databricks(self, uri, entry_point, work_dir, parameters,
                   experiment_id, cluster_spec, run_id):
    tracking_uri = _get_tracking_uri_for_run()
    dbfs_fuse_uri = self._upload_project_to_dbfs(work_dir, experiment_id)
    env_vars = {
        tracking._TRACKING_URI_ENV_VAR: tracking_uri,
        tracking._EXPERIMENT_ID_ENV_VAR: experiment_id,
    }
    eprint("=== Running entry point %s of project %s on Databricks ===" %
           (entry_point, uri))
    # Launch run on Databricks
    with open(cluster_spec, 'r') as handle:
        try:
            cluster_spec = json.load(handle)
        except ValueError:
            eprint(
                "Error when attempting to load and parse JSON cluster spec from file "
                "%s. " % cluster_spec)
            raise
    command = _get_databricks_run_cmd(dbfs_fuse_uri, run_id, entry_point,
                                      parameters)
    return self._run_shell_command_job(uri, command, env_vars,
                                       cluster_spec)
Example #30
def _invoke_mlflow_run_subprocess(work_dir, entry_point, parameters,
                                  experiment_id, use_conda, storage_dir,
                                  run_id):
    """
    Run an MLflow project asynchronously by invoking ``mlflow run`` in a subprocess, returning
    a SubmittedRun that can be used to query run status.
    """
    eprint("=== Asynchronously launching MLflow run with ID %s ===" % run_id)
    # Add the run id into a magic environment variable that the subprocess will read,
    # causing it to reuse the run.
    env_map = {
        tracking._RUN_ID_ENV_VAR: run_id,
        tracking._TRACKING_URI_ENV_VAR: tracking.get_tracking_uri(),
        tracking._EXPERIMENT_ID_ENV_VAR: str(experiment_id),
    }
    mlflow_run_arr = _build_mlflow_run_cmd(uri=work_dir,
                                           entry_point=entry_point,
                                           storage_dir=storage_dir,
                                           use_conda=use_conda,
                                           run_id=run_id,
                                           parameters=parameters)
    mlflow_run_subprocess = _run_mlflow_run_cmd(mlflow_run_arr, env_map)
    return LocalSubmittedRun(run_id, mlflow_run_subprocess)