def _run_local(uri, entry_point, version, parameters, experiment_id, use_conda, use_temp_cwd,
               storage_dir, git_username, git_password):
    """
    Fetch the MLflow project at ``uri`` into a working directory and run an entry point from it.

    Projects may be downloaded from Git URIs (optionally pinned to a commit via ``version``)
    or copied from the local file system.

    :raises ExecutionException: if the fetched project has no ``MLproject`` file.
    """
    eprint("=== Fetching project from %s ===" % uri)
    # Decide where the project will live for this run, then download it there.
    run_dir = _get_work_dir(uri, use_temp_cwd)
    eprint("=== Work directory for this run: %s ===" % run_dir)
    full_uri = _expand_uri(uri)
    _fetch_project(full_uri, version, run_dir, git_username, git_password)

    # The project specification must be present before anything can be run.
    spec_path = os.path.join(run_dir, "MLproject")
    if not os.path.isfile(spec_path):
        raise ExecutionException("No MLproject file found in %s" % uri)
    spec = file_utils.read_yaml(run_dir, "MLproject")
    loaded_project = Project(full_uri, spec)
    _run_project(loaded_project, entry_point, run_dir, parameters, use_conda, storage_dir,
                 experiment_id)
def _create_databricks_run(tracking_uri, experiment_id, source_name, source_version,
                           entry_point_name):
    """
    Ask the tracking server at ``tracking_uri`` to create a new run with the given attributes.

    :return: The object returned by ``tracking._create_run`` for non-local tracking URIs, or
             ``None`` (after printing a warning) when the tracking URI is local.
    """
    if not tracking.is_local_uri(tracking_uri):
        # Non-local tracking URIs are assumed to be reachable from Databricks (this won't
        # hold for e.g. localhost).
        return tracking._create_run(
            experiment_id=experiment_id,
            source_name=source_name,
            source_version=source_version,
            entry_point_name=entry_point_name,
            source_type=SourceType.PROJECT)

    # TODO: we'll actually use the Databricks deployment's tracking URI here in the future
    eprint("WARNING: MLflow tracking URI is set to a local URI (%s), so results from "
           "Databricks will not be logged permanently." % tracking_uri)
    return None
def _wait_for(submitted_run_obj):
    """
    Block until the submitted run finishes, recording its final status via
    ``_maybe_set_run_terminated``.

    :raises ExecutionException: if ``submitted_run_obj.wait()`` reports failure.
    :raises KeyboardInterrupt: re-raised after cancelling the run and marking it FAILED.
    """
    tracked_run = None
    run_id = submitted_run_obj.run_id
    # Note: an interrupt arriving before the try block is entered means the run's status
    # is not reported to the tracking server.
    try:
        if run_id is not None:
            tracked_run = tracking._get_existing_run(run_id)
        succeeded = submitted_run_obj.wait()
        if not succeeded:
            _maybe_set_run_terminated(tracked_run, "FAILED")
            raise ExecutionException("=== Run (ID '%s') failed ===" % run_id)
        eprint("=== Run (ID '%s') succeeded ===" % run_id)
        _maybe_set_run_terminated(tracked_run, "FINISHED")
    except KeyboardInterrupt:
        eprint("=== Run (ID '%s') === interrupted, cancelling run ===" % run_id)
        submitted_run_obj.cancel()
        _maybe_set_run_terminated(tracked_run, "FAILED")
        raise
def _get_entry_point_command(project, entry_point, parameters, conda_env_name, storage_dir):
    """
    Build the shell command string that runs the given entry point.

    :param project: Project containing the target entry point
    :param entry_point: Entry point to run
    :param parameters: Parameters (dictionary) for the entry point command
    :param conda_env_name: Name of the conda environment to activate before running the
                           command; if falsy, no activation step is added.
    :param storage_dir: Base local directory to use for downloading remote artifacts passed to
                        arguments of type 'path'; forwarded to ``_get_storage_dir``.
    :return: The individual shell steps joined with ``" && "``.
    """
    download_dir = _get_storage_dir(storage_dir)
    eprint("=== Created directory %s for downloading remote URIs passed to arguments of "
           "type 'path' ===" % download_dir)

    steps = []
    if conda_env_name:
        conda_activate = _get_conda_bin_executable("activate")
        steps.append("source %s %s" % (conda_activate, conda_env_name))
    target = project.get_entry_point(entry_point)
    steps.append(target.compute_command(parameters, download_dir))
    return " && ".join(steps)
def _already_ran(entry_point_name, parameters, git_commit, experiment_id=None):
    """
    Look for an earlier run of ``entry_point_name`` that can be reused.

    Runs of the experiment are scanned in reverse of the order returned by
    ``client.list_run_infos``. A run is returned only if its entry-point tag matches, every
    entry in ``parameters`` equals the run's logged value, its status is FINISHED and its
    git-commit tag equals ``git_commit``.

    :return: The matching run as returned by ``client.get_run``, or ``None`` if none matches.
    """
    if experiment_id is None:
        experiment_id = _get_experiment_id()
    client = mlflow.tracking.MlflowClient()

    for candidate in reversed(client.list_run_infos(experiment_id)):
        full_run = client.get_run(candidate.run_id)
        tags = full_run.data.tags
        if tags.get(mlflow_tags.MLFLOW_PROJECT_ENTRY_POINT, None) != entry_point_name:
            continue

        # Every requested parameter must have been logged with the same value.
        if any(full_run.data.params.get(key) != wanted
               for key, wanted in six.iteritems(parameters)):
            continue

        if candidate.to_proto().status != RunStatus.FINISHED:
            eprint(("Run matched, but is not FINISHED, so skipping "
                    "(run_id=%s, status=%s)") % (candidate.run_id, candidate.status))
            continue

        found_commit = tags.get(mlflow_tags.MLFLOW_GIT_COMMIT, None)
        if git_commit != found_commit:
            eprint(("Run matched, but has a different source version, so skipping "
                    "(found=%s, expected=%s)") % (found_commit, git_commit))
            continue

        return client.get_run(candidate.run_id)

    eprint("No matching run has been found.")
    return None
def load_model(path, run_id=None, dfs_tmpdir=None):
    """
    Load a Spark MLlib model from ``path``.

    :param path: Local filesystem path or run-relative artifact path to the model.
    :param run_id: Run ID. If provided, combined with ``path`` to identify the model.
    :param dfs_tmpdir: Directory under which the temporary copy of the model is created;
                       falls back to ``DFS_TMP`` when ``None``.
    :return: SparkML model.
    :rtype: pyspark.ml.pipeline.PipelineModel
    :raises Exception: if the model's ``MLmodel`` file does not list the Spark flavor.

    >>> from mlflow import spark
    >>> model = mlflow.spark.load_model("spark-model")
    >>> # Prepare test documents, which are unlabeled (id, text) tuples.
    >>> test = spark.createDataFrame([
    ...     (4, "spark i j k"),
    ...     (5, "l m n"),
    ...     (6, "spark hadoop spark"),
    ...     (7, "apache hadoop")], ["id", "text"])
    >>> # Make predictions on test documents.
    >>> prediction = model.transform(test)
    """
    if dfs_tmpdir is None:
        dfs_tmpdir = DFS_TMP
    if run_id is not None:
        path = mlflow.tracking.utils._get_model_log_dir(model_name=path, run_id=run_id)

    model_meta = Model.load(os.path.join(path, 'MLmodel'))
    if FLAVOR_NAME not in model_meta.flavors:
        raise Exception("Model does not have {} flavor".format(FLAVOR_NAME))
    flavor_conf = model_meta.flavors[FLAVOR_NAME]
    local_model_path = os.path.join(path, flavor_conf['model_data'])

    # Spark ML expects the model to be stored on DFS, so it is copied to a temporary DFS
    # location first. The copy is never deleted because Spark may read from it at any point.
    dfs_copy_path = _tmp_path(dfs_tmpdir)
    _HadoopFileSystem.copy_from_local_file(local_model_path, dfs_copy_path, removeSrc=False)
    loaded = PipelineModel.load(dfs_copy_path)
    eprint("Copied SparkML model to %s" % dfs_copy_path)
    return loaded
def run_databricks(uri, entry_point, version, parameters, experiment_id, cluster_spec,
                   git_username, git_password):
    """
    Run the project at the specified URI on Databricks.

    :param uri: URI of the project to run.
    :param entry_point: Name of the entry point to run; its parameters are validated locally
                        before anything is uploaded.
    :param version: Project version, forwarded to ``_fetch_project``.
    :param parameters: Parameters for the entry point command.
    :param experiment_id: Experiment ID; when not ``None`` it is exported to the remote
                          process through an environment variable.
    :param cluster_spec: Path to a JSON file describing the cluster to launch.
    :param git_username: Git username, forwarded to ``_fetch_project``.
    :param git_password: Git password, forwarded to ``_fetch_project``.
    :return: A ``DatabricksSubmittedRun`` built from the Databricks job run ID and the
             tracking run ID (``None`` when no tracking run was created).
    :raises ExecutionException: if ``cluster_spec`` is ``None``.
    :raises ValueError: re-raised if the cluster spec file does not contain valid JSON.
    """
    _check_databricks_auth_available()
    if cluster_spec is None:
        raise ExecutionException(
            "Cluster spec must be provided when launching MLflow project runs "
            "on Databricks.")

    # Fetch the project into work_dir & validate parameters
    work_dir = _fetch_project(uri=uri, use_temp_cwd=True, version=version,
                              git_username=git_username, git_password=git_password)
    project = _load_project(work_dir)
    project.get_entry_point(entry_point)._validate_parameters(parameters)

    # Upload the project to DBFS, get the URI of the project
    dbfs_project_uri = _upload_project_to_dbfs(work_dir, experiment_id)

    # Create run object with remote tracking server. The git commit is read from the
    # working directory.
    tracking_uri = tracking.get_tracking_uri()
    remote_run = _create_databricks_run(
        tracking_uri=tracking_uri,
        experiment_id=experiment_id,
        source_name=_expand_uri(uri),
        source_version=tracking._get_git_commit(work_dir),
        entry_point_name=entry_point)
    # ``remote_run`` is None when _create_databricks_run declined to create a run; the ID
    # is computed once and reused for the remote command and the returned handle.
    run_id = remote_run.run_info.run_uuid if remote_run else None

    # Set up environment variables for remote execution
    env_vars = {}
    if experiment_id is not None:
        eprint("=== Using experiment ID %s ===" % experiment_id)
        env_vars[tracking._EXPERIMENT_ID_ENV_VAR] = experiment_id
    if remote_run is not None:
        env_vars[tracking._TRACKING_URI_ENV_VAR] = tracking_uri
        env_vars[tracking._RUN_ID_ENV_VAR] = remote_run.run_info.run_uuid
    eprint("=== Running entry point %s of project %s on Databricks. ===" % (entry_point, uri))

    # Launch run on Databricks
    with open(cluster_spec, 'r') as handle:
        try:
            cluster_spec_json = json.load(handle)
        except ValueError:
            # ``cluster_spec`` is still the file path here, so the message names the file.
            eprint("Error when attempting to load and parse JSON cluster spec from file "
                   "%s. " % cluster_spec)
            raise
    fuse_dst_dir = os.path.join("/dbfs/", _parse_dbfs_uri_path(dbfs_project_uri).lstrip("/"))
    command = _get_databricks_run_cmd(fuse_dst_dir, run_id, entry_point, parameters)
    db_run_id = _run_shell_command_job(uri, command, env_vars, cluster_spec_json)
    return DatabricksSubmittedRun(db_run_id, run_id)
def _create_sagemaker_endpoint(endpoint_name, image_url, model_s3_path, run_id, instance_type,
                               instance_count, role, sage_client):
    """
    Create a SageMaker model, an endpoint configuration and an endpoint, in that order.

    :param endpoint_name: Name of the endpoint to create; also used to derive the model and
                          configuration names.
    :param image_url: URL of the ECR-hosted docker image the model is being deployed into.
    :param model_s3_path: S3 path where we stored the model artifacts.
    :param run_id: Run ID that generated this model.
    :param instance_type: The type of SageMaker ML instance on which to deploy the model.
    :param instance_count: The number of SageMaker ML instances on which to deploy the model.
    :param role: SageMaker execution ARN role
    :param sage_client: A boto3 client for SageMaker
    """
    eprint("Creating new endpoint with name: {en} ...".format(en=endpoint_name))

    model_name = _get_sagemaker_model_name(endpoint_name)
    created_model = _create_sagemaker_model(
        model_name=model_name,
        model_s3_path=model_s3_path,
        run_id=run_id,
        image_url=image_url,
        execution_role=role,
        sage_client=sage_client)
    eprint("Created model with arn: %s" % created_model["ModelArn"])

    # A single variant receives all traffic.
    variant = {
        'VariantName': model_name,
        'ModelName': model_name,
        'InitialInstanceCount': instance_count,
        'InstanceType': instance_type,
        'InitialVariantWeight': 1,
    }
    config_name = _get_sagemaker_config_name(endpoint_name)
    config_tags = [{'Key': 'app_name', 'Value': endpoint_name}]
    created_config = sage_client.create_endpoint_config(
        EndpointConfigName=config_name,
        ProductionVariants=[variant],
        Tags=config_tags,
    )
    eprint("Created endpoint configuration with arn: %s"
           % created_config["EndpointConfigArn"])

    created_endpoint = sage_client.create_endpoint(
        EndpointName=endpoint_name,
        EndpointConfigName=config_name,
        Tags=[],
    )
    eprint("Created endpoint with arn: %s" % created_endpoint["EndpointArn"])
def delete(app_name, region_name="us-west-2", archive=False):
    """
    Delete the specified application.

    :param app_name: Name of the deployed application.
    :param region_name: Name of the AWS region in which the application is deployed.
    :param archive: If True, only the endpoint is deleted; the endpoint configuration and the
                    models it references are preserved. If False, those resources are
                    deleted as well.
    """
    s3_client = boto3.client('s3', region_name=region_name)
    sage_client = boto3.client('sagemaker', region_name=region_name)

    # Describe first: the ARN and config name are still needed after the endpoint is gone.
    endpoint_info = sage_client.describe_endpoint(EndpointName=app_name)
    endpoint_arn = endpoint_info["EndpointArn"]
    sage_client.delete_endpoint(EndpointName=app_name)
    eprint("Deleted endpoint with arn: {earn}".format(earn=endpoint_arn))

    if archive:
        return

    config_name = endpoint_info["EndpointConfigName"]
    config_info = sage_client.describe_endpoint_config(EndpointConfigName=config_name)
    config_arn = config_info["EndpointConfigArn"]
    sage_client.delete_endpoint_config(EndpointConfigName=config_name)
    eprint("Deleted associated endpoint configuration with arn: {carn}".format(carn=config_arn))

    for variant in config_info["ProductionVariants"]:
        deleted_model_arn = _delete_sagemaker_model(variant["ModelName"], sage_client,
                                                    s3_client)
        eprint("Deleted associated model with arn: {marn}".format(marn=deleted_model_arn))
def build_image(name=DEFAULT_IMAGE_NAME, mlflow_home=None):
    """
    Build an MLflow Docker image locally; Docker must be available.

    :param name: image name
    :param mlflow_home: Optional path to an MLflow source tree. When given, the tree is copied
                        into the build context and installed in the image; otherwise MLflow
                        is installed from pip.
    """
    with TempDir() as tmp:
        # Default installation step; replaced below when a local source tree is supplied.
        install_step = "RUN pip install mlflow=={version}".format(
            version=mlflow.version.VERSION)
        context_dir = tmp.path()
        if not mlflow_home:
            eprint("`mlflow_home` was not specified. The image will install"
                   " MLflow from pip instead. As a result, the container will"
                   " not support the MLeap flavor.")
        else:
            copied_dir = _copy_project(src_path=mlflow_home, dst_path=tmp.path())
            source_install_template = ("COPY {mlflow_dir} /opt/mlflow\n"
                                       "RUN cd /opt/mlflow/mlflow/java/scoring &&"
                                       " mvn --batch-mode package -DskipTests \n"
                                       "RUN pip install /opt/mlflow\n")
            install_step = source_install_template.format(mlflow_dir=copied_dir)

        dockerfile_path = os.path.join(context_dir, "Dockerfile")
        with open(dockerfile_path, "w") as dockerfile:
            dockerfile.write(_DOCKERFILE_TEMPLATE % install_step)

        eprint("building docker image")
        # List the contents of the build context.
        os.system('find {cwd}/'.format(cwd=context_dir))
        build_cmd = ["docker", "build", "-t", name, "-f", "Dockerfile", "."]
        proc = Popen(build_cmd, cwd=context_dir, stdout=PIPE, stderr=STDOUT,
                     universal_newlines=True)
        # Relay the build output line by line until the stream is exhausted.
        for output_line in iter(proc.stdout.readline, ""):
            eprint(output_line, end='')
def _run_project(project, entry_point, work_dir, parameters, use_conda, storage_dir,
                 experiment_id):
    """
    Locally run a project that has been checked out in `work_dir`.

    The entry point's command is computed first, a tracking run is started and the supplied
    parameters are logged to it, then the command is executed in a shell with ``work_dir`` as
    its working directory. The run is ended normally on success and with status "FAILED" when
    the command raises ``process.ShellCommandException``.
    """
    download_dir = _get_storage_dir(storage_dir)
    eprint("=== Created directory %s for downloading remote URIs passed to arguments of "
           "type 'path' ===" % download_dir)

    # Build the command up front so mis-specified parameters fail before a run is created.
    entry_command = project.get_entry_point(entry_point).compute_command(parameters,
                                                                         download_dir)
    steps = []
    if use_conda:
        env_spec_path = os.path.abspath(os.path.join(work_dir, project.conda_env))
        _maybe_create_conda_env(env_spec_path)
        steps.append("source activate %s" % _get_conda_env_name(env_spec_path))

    # Start a new run and record every supplied parameter on it.
    active_run = tracking.start_run(
        experiment_id=experiment_id,
        source_name=project.uri,
        source_version=tracking._get_git_commit(work_dir),
        entry_point_name=entry_point,
        source_type=SourceType.PROJECT)
    if parameters is not None:
        for name, value in parameters.items():
            active_run.log_param(Param(name, value))

    # The subprocess reads these environment variables, causing it to reuse the run.
    exp_id = experiment_id or tracking._get_experiment_id()
    child_env = {
        tracking._RUN_NAME_ENV_VAR: active_run.run_info.run_uuid,
        tracking._TRACKING_URI_ENV_VAR: tracking.get_tracking_uri(),
        tracking._EXPERIMENT_ID_ENV_VAR: str(exp_id),
    }

    steps.append(entry_command)
    command = " && ".join(steps)
    eprint("=== Running command: %s ===" % command)
    shell = os.environ.get("SHELL", "bash")
    try:
        process.exec_cmd([shell, "-c", command], cwd=work_dir, stream_output=True,
                         env=child_env)
        tracking.end_run()
        eprint("=== Run succeeded ===")
    except process.ShellCommandException:
        tracking.end_run("FAILED")
        eprint("=== Run failed ===")
def _already_ran(entry_point_name, parameters, source_version, experiment_id=None):
    """
    Best-effort detection of whether a run with the given entry point name, parameters and
    experiment id already ran.

    A run qualifies only if it finished successfully, has at least the provided parameters
    with equal values, and was produced from ``source_version``.

    :return: The matching run as returned by ``client.get_run``, or ``None``.
    """
    if experiment_id is None:
        experiment_id = _get_experiment_id()
    client = mlflow.tracking.MlflowClient()

    for candidate in reversed(client.list_run_infos(experiment_id)):
        if candidate.entry_point_name != entry_point_name:
            continue

        logged_params = _get_params(client.get_run(candidate.run_uuid))
        # Every requested parameter must have been logged with the same value.
        if any(logged_params.get(key) != wanted
               for key, wanted in six.iteritems(parameters)):
            continue

        if candidate.status != RunStatus.FINISHED:
            eprint(("Run matched, but is not FINISHED, so skipping "
                    "(run_id=%s, status=%s)") % (candidate.run_uuid, candidate.status))
            continue

        if candidate.source_version != source_version:
            eprint(("Run matched, but has a different source version, so skipping "
                    "(found=%s, expected=%s)") % (candidate.source_version, source_version))
            continue

        return client.get_run(candidate.run_uuid)

    return None
def run_local(model_path, run_id=None, port=5000, image=DEFAULT_IMAGE_NAME):
    """
    Serve model locally in a SageMaker compatible Docker container.

    :param model_path: Path to the model. Local when no ``run_id`` is given, otherwise
                       relative to the run identified by ``run_id``.
    :param run_id: MLflow RUN-ID.
    :param port: Local port that is mapped to port 8080 of the container.
    :param image: name of the Docker image to be used.
    """
    import signal

    if run_id:
        model_path = _get_model_log_dir(model_path, run_id)
    _check_compatible(model_path)
    model_path = os.path.abspath(model_path)
    eprint("launching docker image with path {}".format(model_path))

    volume_arg = "{}:/opt/ml/model/".format(model_path)
    port_arg = "%d:8080" % port
    cmd = ["docker", "run", "-v", volume_arg, "-p", port_arg, "--rm", image, "serve"]
    eprint('executing', ' '.join(cmd))
    proc = Popen(cmd, stdout=PIPE, stderr=STDOUT, universal_newlines=True)

    def _forward_termination(*_):
        # On SIGTERM, interrupt the docker process.
        eprint("received termination signal => killing docker process")
        proc.send_signal(signal.SIGINT)

    signal.signal(signal.SIGTERM, _forward_termination)
    # Relay the container output line by line until the stream is exhausted.
    for output_line in iter(proc.stdout.readline, ""):
        eprint(output_line, end='')
def _get_default_s3_bucket(region_name):
    """
    Return the name of the default bucket for ``region_name``, creating it if it is missing.

    The bucket name is built from ``DEFAULT_BUCKET_NAME_PREFIX``, the region name and the
    account ID returned by ``_get_account_id``.
    """
    session = boto3.Session()
    account_id = _get_account_id()
    bucket_name = "{pfx}-{rn}-{aid}".format(pfx=DEFAULT_BUCKET_NAME_PREFIX, rn=region_name,
                                            aid=account_id)
    s3 = session.client('s3')
    existing_names = [bucket['Name'] for bucket in s3.list_buckets()["Buckets"]]

    if bucket_name in existing_names:
        eprint("Default bucket `%s` already exists. Skipping creation." % bucket_name)
        return bucket_name

    eprint("Default bucket `%s` not found. Creating..." % bucket_name)
    create_kwargs = {
        'ACL': 'bucket-owner-full-control',
        'Bucket': bucket_name,
    }
    if region_name != "us-east-1":
        # The location constraint is required during bucket creation for all regions
        # outside of us-east-1. This constraint cannot be specified in us-east-1;
        # specifying it in this region results in a failure, so it is only added
        # when deploying outside of us-east-1.
        # See https://docs.aws.amazon.com/cli/latest/reference/s3api/create-bucket.html#examples
        create_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': region_name}
    eprint(s3.create_bucket(**create_kwargs))
    return bucket_name
def register_model(model_uri, name):
    """
    Create a new model version in model registry for the model files specified by
    ``model_uri``. Note that this method assumes the model registry backend URI is the same
    as that of the tracking backend.

    :param model_uri: URI referring to the MLmodel directory. Use a ``runs:/`` URI if you want
                      to record the run ID with the model in model registry. ``models:/`` URIs
                      are currently not supported.
    :param name: Name of the registered model under which to create a new model version. If a
                 registered model with the given name does not exist, it will be created
                 automatically.
    :return: Single :py:class:`mlflow.entities.model_registry.ModelVersion` object created by
             backend.
    """
    client = MlflowClient()
    try:
        registered = client.create_registered_model(name)
        eprint("Successfully registered model '%s'." % registered.name)
    except MlflowException as e:
        # An already-existing model is expected; anything else is a real error.
        if e.error_code != ErrorCode.Name(RESOURCE_ALREADY_EXISTS):
            raise e
        eprint("Registered model '%s' already exists. Creating a new version of this model..."
               % name)

    if not RunsArtifactRepository.is_runs_uri(model_uri):
        new_version = client.create_model_version(name, source=model_uri, run_id=None)
    else:
        # For runs:/ URIs, register the underlying artifact location and keep the run ID.
        source = RunsArtifactRepository.get_underlying_uri(model_uri)
        (run_id, _) = RunsArtifactRepository.parse_runs_uri(model_uri)
        new_version = client.create_model_version(name, source, run_id)

    eprint("Created version '{version}' of model '{model_name}'.".format(
        version=new_version.version, model_name=new_version.get_name()))
    return new_version
def ui(backend_store_uri, default_artifact_root, port, host):
    """
    Launch the MLflow tracking UI for local viewing of run results. To launch a production
    server, use the "mlflow server" command instead.

    The UI will be visible at http://localhost:5000 by default, and only accept connections
    from the local machine. To let the UI server accept connections from other machines, you
    will need to pass ``--host 0.0.0.0`` to listen on all network interfaces (or a specific
    interface address).
    """
    from mlflow.server import _run_server
    from mlflow.server.handlers import initialize_backend_stores

    # Fill in defaults so that both the backend store and the artifact root are set.
    if not backend_store_uri:
        backend_store_uri = DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH
    if not default_artifact_root:
        # A local backend store doubles as the artifact root.
        default_artifact_root = (backend_store_uri if is_local_uri(backend_store_uri)
                                 else DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH)

    try:
        initialize_backend_stores(backend_store_uri, default_artifact_root)
    except Exception as e:  # pylint: disable=broad-except
        _logger.error("Error initializing backend store")
        _logger.exception(e)
        sys.exit(1)

    # TODO: We eventually want to disable the write path in this version of the server.
    try:
        _run_server(backend_store_uri, default_artifact_root, host, port, None, 1)
    except ShellCommandException:
        eprint("Running the mlflow server failed. Please see the logs above for details.")
        sys.exit(1)
def http_request(hostname, endpoint, method, headers=None, req_body_json=None, params=None,
                 secure_verify=True, retries=3, retry_interval=3):
    """
    Makes an HTTP request with the specified method to the specified hostname/endpoint.

    The request is attempted up to `retries` times. A response with a status code in
    [200, 500) is parsed as JSON and returned immediately; any other status code is logged
    and the request is retried after waiting `retry_interval` seconds. No wait is performed
    after the final attempt.

    :param hostname: Host part of the URL; concatenated directly with ``endpoint``.
    :param endpoint: Path part of the URL.
    :param method: HTTP method to use.
    :param headers: Request headers to use when making the HTTP request
    :param req_body_json: Dictionary containing the request body
    :param params: Query parameters for the request
    :param secure_verify: Passed to ``requests.request`` as ``verify``.
    :param retries: Maximum number of attempts.
    :param retry_interval: Seconds to wait between successive attempts.
    :return: Parsed API response
    :raises Exception: if no attempt returned a status code in [200, 500).
    """
    url = "%s%s" % (hostname, endpoint)
    for attempt in range(retries):
        response = requests.request(method=method, url=url, headers=headers,
                                    verify=secure_verify, params=params, json=req_body_json)
        if 200 <= response.status_code < 500:
            return json.loads(response.text)
        retries_left = retries - attempt - 1
        eprint("API request to %s failed with code %s != 200, retrying up to %s more times. "
               "API response body: %s" % (url, response.status_code, retries_left,
                                          response.text))
        # Only wait when another attempt will follow; sleeping before raising just delays
        # the failure.
        if retries_left > 0:
            time.sleep(retry_interval)
    raise Exception("API request to %s failed to return code 200 after %s tries" %
                    (url, retries))
def _deploy(role, image, app_name, model_s3_path, run_id, region_name):
    """
    Deploy model on sagemaker.

    Creates a SageMaker model, an endpoint configuration with a single variant and an
    endpoint named after the app.

    :param role: SageMaker execution ARN role
    :param image: Name of the Docker image the model is being deployed into; looked up as an
                  ECR repository name.
    :param app_name: Name of the deployed app
    :param model_s3_path: s3 path where we stored the model artifacts
    :param run_id: RunId that generated this model
    :param region_name: AWS region used for the SageMaker client.
    """
    sage_client = boto3.client('sagemaker', region_name=region_name)
    ecr_client = boto3.client("ecr")
    repositories = ecr_client.describe_repositories(repositoryNames=[image])['repositories']
    repository_conf = repositories[0]

    model_name = app_name + '-model'
    container = {
        'ContainerHostname': 'mlflow-serve-%s' % model_name,
        'Image': repository_conf["repositoryUri"],
        'ModelDataUrl': model_s3_path,
        'Environment': {},
    }
    created_model = sage_client.create_model(
        ModelName=model_name,
        PrimaryContainer=container,
        ExecutionRoleArn=role,
        Tags=[{'Key': 'run_id', 'Value': str(run_id)}],
    )
    eprint("model_arn: %s" % created_model["ModelArn"])

    config_name = app_name + "-config"
    variant = {
        'VariantName': 'model1',
        'ModelName': model_name,  # is this the unique identifier for Model?
        'InitialInstanceCount': 1,
        'InstanceType': 'ml.m4.xlarge',
        'InitialVariantWeight': 1,
    }
    created_config = sage_client.create_endpoint_config(
        EndpointConfigName=config_name,
        ProductionVariants=[variant],
        Tags=[{'Key': 'app_name', 'Value': app_name}],
    )
    eprint("endpoint_config_arn: %s" % created_config["EndpointConfigArn"])

    created_endpoint = sage_client.create_endpoint(
        EndpointName=app_name,
        EndpointConfigName=config_name,
        Tags=[],
    )
    eprint("endpoint_arn: %s" % created_endpoint["EndpointArn"])
def _fetch_project(uri, force_tempdir, version=None, git_username=None, git_password=None):
    """
    Fetch a project into a local directory, returning the path to the local project directory.

    :param uri: Project URI, optionally carrying a subdirectory (see ``_parse_subdirectory``).
    :param force_tempdir: If True, will fetch the project into a temporary directory.
                          Otherwise, will fetch Git projects into a temporary directory but
                          simply return the path of local projects (i.e. perform a no-op for
                          local projects).
    :param version: Version to fetch; only allowed for Git project URIs.
    :param git_username: Forwarded to ``_fetch_git_repo``.
    :param git_password: Forwarded to ``_fetch_git_repo``.
    :return: Absolute path of the project directory (including the subdirectory, if any).
    :raises ExecutionException: if ``version`` is set for a non-Git URI, or if no
                                ``MLproject`` file exists in the resulting directory.
    """
    parsed_uri, subdirectory = _parse_subdirectory(uri)
    use_temp_dst_dir = force_tempdir or not _is_local_path(uri)
    if use_temp_dst_dir:
        dst_dir = tempfile.mkdtemp()
        eprint("=== Fetching project from %s into %s ===" % (uri, dst_dir))
    else:
        dst_dir = urllib.parse.urlparse(uri).path

    # Populate `dst_dir` from a Git URI or a local path.
    if _GIT_URI_REGEX.match(uri):
        _fetch_git_repo(parsed_uri, version, dst_dir, git_username, git_password)
    elif version is not None:
        raise ExecutionException("Setting a version is only supported for Git project URIs")
    elif use_temp_dst_dir:
        dir_util.copy_tree(src=parsed_uri, dst=dst_dir)

    # The resulting directory must contain an MLproject file.
    project_dir = os.path.join(dst_dir, subdirectory)
    if not os.path.isfile(os.path.join(dst_dir, subdirectory, "MLproject")):
        if subdirectory != '':
            raise ExecutionException("No MLproject file found in subdirectory %s of %s" %
                                     (subdirectory, parsed_uri))
        raise ExecutionException("No MLproject file found in %s" % uri)
    return os.path.abspath(project_dir)
def ui(backend_store_uri, default_artifact_root, host, port, gunicorn_opts):
    """
    Launch the MLflow tracking UI.

    The UI will be visible at http://localhost:5000 by default.
    """
    # Fill in defaults so that both the backend store and the artifact root are set.
    if not backend_store_uri:
        backend_store_uri = DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH
    if not default_artifact_root:
        # A local backend store doubles as the artifact root.
        default_artifact_root = (backend_store_uri if _is_local_uri(backend_store_uri)
                                 else DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH)

    # TODO: We eventually want to disable the write path in this version of the server.
    try:
        _run_server(backend_store_uri, default_artifact_root, host, port, 1, None,
                    gunicorn_opts)
    except ShellCommandException:
        eprint("Running the mlflow server failed. Please see the logs above for details.")
        sys.exit(1)
def _build_image(image_name, entrypoint, mlflow_home=None, custom_setup_steps_hook=None):
    """
    Build an MLflow Docker image. The image is built locally and it requires Docker to run.

    :param image_name: Docker image name.
    :param entrypoint: String containing ENTRYPOINT directive for docker image
    :param mlflow_home: (Optional) Path to a local copy of the MLflow GitHub repository.
                        If specified, the image will install MLflow from this directory.
                        If None, it will install MLflow from pip.
    :param custom_setup_steps_hook: (Optional) Single-argument function that takes the string
                                    path of a dockerfile context directory and returns a
                                    string containing Dockerfile commands to run during the
                                    image build step.
    """
    if mlflow_home:
        mlflow_home = os.path.abspath(mlflow_home)
    else:
        mlflow_home = None

    with TempDir() as tmp:
        context_dir = tmp.path()
        install_step = _get_mlflow_install_step(context_dir, mlflow_home)
        extra_steps = ""
        if custom_setup_steps_hook:
            extra_steps = custom_setup_steps_hook(context_dir)

        dockerfile_path = os.path.join(context_dir, "Dockerfile")
        with open(dockerfile_path, "w") as dockerfile:
            dockerfile.write(_DOCKERFILE_TEMPLATE.format(
                install_mlflow=install_step,
                custom_setup_steps=extra_steps,
                entrypoint=entrypoint))

        _logger.info("Building docker image with name %s", image_name)
        # List the contents of the build context.
        os.system('find {cwd}/'.format(cwd=context_dir))
        build_cmd = ["docker", "build", "-t", image_name, "-f", "Dockerfile", "."]
        proc = Popen(build_cmd, cwd=context_dir, stdout=PIPE, stderr=STDOUT,
                     universal_newlines=True)
        # Relay the build output line by line until the stream is exhausted.
        for output_line in iter(proc.stdout.readline, ""):
            eprint(output_line, end='')
def _user_args_to_dict(arguments, argument_type="P"): user_dict = {} for arg in arguments: split = arg.split("=", maxsplit=1) # Docker arguments such as `t` don't require a value -> set to True if specified if len(split) == 1 and argument_type == "A": name = split[0] value = True elif len(split) == 2: name = split[0] value = split[1] else: eprint( "Invalid format for -%s parameter: '%s'. " "Use -%s name=value." % (argument_type, arg, argument_type) ) sys.exit(1) if name in user_dict: eprint("Repeated parameter: '%s'" % name) sys.exit(1) user_dict[name] = value return user_dict
def patch_impl(original):
    """
    Call ``original`` surrounded by a fixed sequence of prints, log records and warnings.

    Before the call: one ``eprint``, one INFO log record and two warnings (attributed to the
    MLflow and numpy module files). After the call: two more warnings, then a WARNING and a
    CRITICAL log record.
    """
    def _emit(warning_specs):
        # Issue each (message, category, filename, lineno) warning in order.
        for message, category, filename, lineno in warning_specs:
            warnings.warn_explicit(message, category=category, filename=filename,
                                   lineno=lineno)

    eprint("patch1")
    logger.info("patch2")
    _emit([
        ("preamble MLflow warning", Warning, mlflow.__file__, 5),
        ("preamble numpy warning", UserWarning, np.__file__, 7),
    ])

    original()

    _emit([
        ("postamble MLflow warning", Warning, mlflow.__file__, 10),
        ("postamble numpy warning", Warning, np.__file__, 14),
    ])
    logger.warning("patch3")
    logger.critical("patch4")
def _run_shell_command_job(project_uri, command, env_vars, cluster_spec):
    """
    Runs the specified shell command on a Databricks cluster.

    :param project_uri: URI of the project from which our shell command originates
    :param command: Shell command to run
    :param env_vars: Environment variables to set in the process running `command`
    :param cluster_spec: Dictionary describing the cluster, expected to contain the fields for
                         a NewCluster (see
                         https://docs.databricks.com/api/latest/jobs.html#jobsclusterspecnewcluster)
    :return: The ID of the Databricks Job Run. Can be used to query the run's status via the
             Databricks Runs Get API
             (https://docs.databricks.com/api/latest/jobs.html#runs-get).
    """
    # NB: We use <= on the version specifier to allow running projects on pre-release
    # versions, where we will select the most up-to-date mlflow version available.
    # Also note, that we escape this so '<' is not treated as a shell pipe.
    mlflow_library = {"pypi": {"package": "'mlflow<=%s'" % VERSION}}
    shell_task = {'command': command, "env_vars": env_vars}
    # Request body for the jobs API call that launches the run.
    submit_body = {
        'run_name': 'MLflow Run for %s' % project_uri,
        'new_cluster': cluster_spec,
        'shell_command_task': shell_task,
        "libraries": [mlflow_library],
    }
    databricks_run_id = _jobs_runs_submit(submit_body)["run_id"]
    eprint("=== Launched MLflow run as Databricks job run with ID %s. Getting run status "
           "page URL... ===" % databricks_run_id)
    status_page_url = _jobs_runs_get(databricks_run_id)["run_page_url"]
    eprint("=== Check the run's status at %s ===" % status_page_url)
    return databricks_run_id
def push_image_to_ecr(image=DEFAULT_IMAGE_NAME):
    """
    Push local Docker image to AWS ECR.

    The image is pushed under the currently active AWS account and to the currently active
    AWS region (falling back to "us-west-2" when the session has no region). The ECR
    repository is created if it does not exist yet.

    :param image: Docker image name.
    """
    eprint("Pushing image to ECR")
    sts_client = boto3.client("sts")
    account = sts_client.get_caller_identity()['Account']
    session = boto3.session.Session()
    region = session.region_name or "us-west-2"
    fullname = _full_template.format(account=account, region=region, image=image,
                                     version=mlflow.version.VERSION)
    eprint("Pushing docker image {image} to {repo}".format(image=image, repo=fullname))

    ecr_client = boto3.client('ecr')
    try:
        # Only used as an existence check; the result itself is not needed.
        ecr_client.describe_repositories(repositoryNames=[image])['repositories']
    except ecr_client.exceptions.RepositoryNotFoundException:
        ecr_client.create_repository(repositoryName=image)
        print("Created new ECR repository: {repository_name}".format(repository_name=image))

    # TODO: it would be nice to translate the docker login, tag and push to python api.
    # x = ecr_client.get_authorization_token()['authorizationData'][0]
    # docker_login_cmd = "docker login -u AWS -p {token} {url}".format(
    #     token=x['authorizationToken'], url=x['proxyEndpoint'])
    shell_steps = [
        "$(aws ecr get-login --no-include-email)",
        "docker tag {image} {fullname}".format(image=image, fullname=fullname),
        "docker push {}".format(fullname),
    ]
    os.system(";\n".join(shell_steps))
def http_request(host_creds, endpoint, retries=3, retry_interval=3, **kwargs):
    """
    Makes an HTTP request with the specified method to the specified hostname/endpoint. Retries
    up to `retries` times if a request fails with a server error (e.g. error code 500), waiting
    `retry_interval` seconds between successive retries. Responses with a status code in the
    range [200, 500) are returned immediately, without retrying.

    :param host_creds: A :py:class:`mlflow.rest_utils.MlflowHostCreds` object containing
                       hostname and optional authentication.
    :param endpoint: Path appended to the host (after stripping a trailing '/' from the host).
    :param retries: Maximum number of attempts.
    :param retry_interval: Seconds to wait between two successive attempts.
    :param kwargs: Forwarded to ``requests.request`` (e.g. ``method``, ``json``, ``params``).
                   A ``headers`` entry is merged with the ``Authorization`` header derived from
                   ``host_creds``; the latter takes precedence.
    :return: The response object returned by ``requests.request`` (not parsed).
    :raises MlflowException: if every attempt returned a status code outside [200, 500).
    """
    auth_str = None
    if host_creds.username and host_creds.password:
        basic_auth_str = ("%s:%s" % (host_creds.username, host_creds.password)).encode("utf-8")
        auth_str = "Basic " + base64.standard_b64encode(basic_auth_str).decode("utf-8")
    elif host_creds.token:
        auth_str = "Bearer %s" % host_creds.token

    # Take caller-supplied headers out of kwargs so they do not collide with the explicit
    # `headers=` argument below; copy them so the caller's dict is never mutated.
    headers = dict(kwargs.pop('headers', None) or {})
    if auth_str:
        headers['Authorization'] = auth_str

    verify = not host_creds.ignore_tls_verification
    url = "%s%s" % (strip_suffix(host_creds.host, '/'), endpoint)

    for i in range(retries):
        response = requests.request(url=url, headers=headers, verify=verify, **kwargs)
        if 200 <= response.status_code < 500:
            return response
        attempts_left = retries - i - 1
        eprint("API request to %s failed with code %s != 200, retrying up to %s more times. "
               "API response body: %s" % (url, response.status_code, attempts_left,
                                          response.text))
        # Only wait when another attempt will follow; sleeping before raising is wasted time.
        if attempts_left > 0:
            time.sleep(retry_interval)
    raise MlflowException("API request to %s failed to return code 200 after %s tries" %
                          (url, retries))
def _run_project(project, entry_point, work_dir, parameters, use_conda, storage_dir,
                 experiment_id, block=True):
    """
    Locally run a project that has been checked out in `work_dir`.

    :param project: Project object providing ``get_entry_point``, ``conda_env`` and ``uri``.
    :param entry_point: Name of the entry point to run.
    :param work_dir: Directory containing the checked-out project.
    :param parameters: Dictionary of parameters for the entry point, or None. Each entry is
                       logged as a param of the newly created run.
    :param use_conda: If truthy, the project's conda environment is created if needed and
                      activated before running the command.
    :param storage_dir: Passed to ``_get_storage_dir`` to obtain the directory used for
                        downloading remote URIs passed to arguments of type 'path'.
    :param experiment_id: ID of the experiment under which the run is created.
    :param block: Forwarded as ``stream_output`` to ``_launch_local_run``. Defaults to ``True``
                  so that callers passing only the first seven arguments (as ``_run_local``
                  does) keep working.
    :return: Whatever ``_launch_local_run`` returns for the launched command.
    """
    storage_dir_for_run = _get_storage_dir(storage_dir)
    eprint("=== Created directory %s for downloading remote URIs passed to arguments of "
           "type 'path' ===" % storage_dir_for_run)
    # Try to build the command first in case the user mis-specified parameters
    run_project_command = project.get_entry_point(entry_point).compute_command(
        parameters, storage_dir_for_run)
    commands = []
    if use_conda:
        conda_env_path = os.path.abspath(os.path.join(work_dir, project.conda_env))
        _maybe_create_conda_env(conda_env_path)
        commands.append("source activate %s" % _get_conda_env_name(conda_env_path))

    # Create a new run and log every provided parameter into it.
    active_run = tracking._create_run(
        experiment_id=experiment_id,
        source_name=project.uri,
        source_version=tracking._get_git_commit(work_dir),
        entry_point_name=entry_point,
        source_type=SourceType.PROJECT)
    if parameters is not None:
        for key, value in parameters.items():
            active_run.log_param(Param(key, value))
    # Add the run id into a magic environment variable that the subprocess will read,
    # causing it to reuse the run.
    env_map = {
        tracking._RUN_ID_ENV_VAR: active_run.run_info.run_uuid,
        tracking._TRACKING_URI_ENV_VAR: tracking.get_tracking_uri(),
        tracking._EXPERIMENT_ID_ENV_VAR: str(experiment_id),
    }

    commands.append(run_project_command)
    # Chain with "&&" so the project command only runs if environment activation succeeded.
    command = " && ".join(commands)
    eprint("=== Running command '%s' in run with ID '%s' === "
           % (command, active_run.run_info.run_uuid))
    return _launch_local_run(
        active_run, command, work_dir, env_map, stream_output=block)
def server(backend_store_uri, default_artifact_root, host, port, workers, static_prefix,
           gunicorn_opts, waitress_opts):
    """
    Run the MLflow tracking server.

    The server listens on http://localhost:5000 by default, and only accepts connections
    from the local machine. To let the server accept connections from other machines, you will
    need to pass ``--host 0.0.0.0`` to listen on all network interfaces
    (or a specific interface address).
    """
    _validate_server_args(gunicorn_opts=gunicorn_opts, workers=workers,
                          waitress_opts=waitress_opts)

    # Fall back to the default local path when no backend store was given.
    backend_store_uri = backend_store_uri or DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH

    # Without an explicit artifact root, reuse the backend store location, which is only
    # possible when that store is local.
    if not default_artifact_root:
        if not _is_local_uri(backend_store_uri):
            eprint("Option 'default-artifact-root' is required, when backend store is not "
                   "local file based.")
            sys.exit(1)
        default_artifact_root = backend_store_uri

    # Initialize the store up front so that configuration problems are reported before the
    # server is started.
    try:
        _get_store(backend_store_uri, default_artifact_root)
    except Exception as store_error:  # pylint: disable=broad-except
        _logger.error("Error initializing backend store")
        _logger.exception(store_error)
        sys.exit(1)

    try:
        _run_server(backend_store_uri, default_artifact_root, host, port, static_prefix,
                    workers, gunicorn_opts, waitress_opts)
    except ShellCommandException:
        eprint("Running the mlflow server failed. Please see the logs above for details.")
        sys.exit(1)
def run_databricks(self, uri, entry_point, work_dir, parameters, experiment_id, cluster_spec,
                   run_id):
    """
    Run the given entry point of a project on Databricks as a shell-command job.

    :param uri: URI of the project; used in log output and passed on to the job submission.
    :param entry_point: Name of the entry point to run.
    :param work_dir: Local project directory handed to ``self._upload_project_to_dbfs``.
    :param parameters: Parameters passed on to ``_get_databricks_run_cmd``.
    :param experiment_id: Experiment ID, exported to the job through an environment variable.
    :param cluster_spec: Path to a file containing the JSON cluster specification.
    :param run_id: Run ID passed on to ``_get_databricks_run_cmd``.
    :return: The result of ``self._run_shell_command_job``.
    :raises ValueError: if the cluster spec file does not contain valid JSON.
    """
    tracking_uri = _get_tracking_uri_for_run()
    dbfs_fuse_uri = self._upload_project_to_dbfs(work_dir, experiment_id)
    job_env = {
        tracking._TRACKING_URI_ENV_VAR: tracking_uri,
        tracking._EXPERIMENT_ID_ENV_VAR: experiment_id,
    }
    eprint("=== Running entry point %s of project %s on Databricks ===" % (entry_point, uri))

    # Launch run on Databricks: parse the cluster spec file, then submit the command.
    with open(cluster_spec, 'r') as spec_file:
        try:
            parsed_cluster_spec = json.load(spec_file)
        except ValueError:
            # Report which file was malformed before propagating the parse error.
            eprint("Error when attempting to load and parse JSON cluster spec from file "
                   "%s. " % cluster_spec)
            raise
    run_cmd = _get_databricks_run_cmd(dbfs_fuse_uri, run_id, entry_point, parameters)
    return self._run_shell_command_job(uri, run_cmd, job_env, parsed_cluster_spec)
def _invoke_mlflow_run_subprocess(work_dir, entry_point, parameters, experiment_id, use_conda,
                                  storage_dir, run_id):
    """
    Launch ``mlflow run`` for the project in ``work_dir`` in a subprocess, without waiting for
    it here.

    :return: A ``LocalSubmittedRun`` built from ``run_id`` and the launched subprocess, which
             can be used to query run status.
    """
    eprint("=== Asynchronously launching MLflow run with ID %s ===" % run_id)

    # The run ID is exported through a magic environment variable that the subprocess reads,
    # causing it to reuse the existing run. The tracking URI and the experiment ID are handed
    # over the same way.
    child_env = {
        tracking._RUN_ID_ENV_VAR: run_id,
        tracking._TRACKING_URI_ENV_VAR: tracking.get_tracking_uri(),
        tracking._EXPERIMENT_ID_ENV_VAR: str(experiment_id),
    }
    run_cmd = _build_mlflow_run_cmd(
        uri=work_dir,
        entry_point=entry_point,
        storage_dir=storage_dir,
        use_conda=use_conda,
        run_id=run_id,
        parameters=parameters)
    return LocalSubmittedRun(run_id, _run_mlflow_run_cmd(run_cmd, child_env))