Example #1
 def __init__(self, client=None, export_metadata_tags=False, notebook_formats=["SOURCE"], filesystem=None):
     self.client = client or mlflow.tracking.MlflowClient()
     self.dbx_client = DatabricksHttpClient()
     print("Databricks REST client:",self.dbx_client)
     self.fs = filesystem or _filesystem.get_filesystem()
     print("Filesystem:",type(self.fs).__name__)
     self.export_metadata_tags = export_metadata_tags
     self.notebook_formats = notebook_formats
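Judging from example #2 below, which constructs RunExporter(self.client, export_metadata_tags, notebook_formats, self.fs), this is most likely the RunExporter constructor. A minimal usage sketch under that assumption:

    # Hypothetical usage; the class name RunExporter is inferred from example #2,
    # and the argument values are illustrative.
    import mlflow
    client = mlflow.tracking.MlflowClient()
    exporter = RunExporter(client=client, export_metadata_tags=True, notebook_formats=["SOURCE"])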
Example #2
 def __init__(self,
              client=None,
              export_metadata_tags=False,
              notebook_formats=["SOURCE"],
              filesystem=None):
     self.client = client or mlflow.tracking.MlflowClient()
     self.fs = filesystem or _filesystem.get_filesystem()
     print("Filesystem:", type(self.fs).__name__)
     self.run_exporter = RunExporter(self.client, export_metadata_tags,
                                     notebook_formats, self.fs)
Example #3
 def __init__(self,
              export_metadata_tags=False,
              notebook_formats=["SOURCE"],
              filesystem=None):
     self.fs = filesystem or _filesystem.get_filesystem()
     self.client = mlflow.tracking.MlflowClient()
     self.client2 = HttpClient("api/2.0/preview/mlflow")
     self.run_exporter = RunExporter(
         self.client,
         export_metadata_tags=export_metadata_tags,
         notebook_formats=notebook_formats,
         filesystem=filesystem)
Example #4
def _export_models(models, output_dir, notebook_formats, export_notebook_revision, stages, export_run=True, use_threads=False):
    max_workers = (os.cpu_count() or 4) if use_threads else 1  # parentheses so use_threads=False really means one worker
    start_time = time.time()
    if models == "all":
        models = [ model.name for model in client.list_registered_models() ]
    elif models.endswith("*"):
        model_prefix = models[:-1]
        models = [ model.name for model in client.list_registered_models() if model.name.startswith(model_prefix) ] # Wish there was an model search method for efficiency]
    else:
        models = models.split(",")
    print("Models:")
    for model in models:
        print(f"  {model}")

    exporter = ModelExporter(stages=stages, notebook_formats=utils.string_to_list(notebook_formats), export_notebook_revision=export_notebook_revision, export_run=export_run)
    futures = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for model in models:
            out_dir = os.path.join(output_dir, model)
            future = executor.submit(exporter.export_model, model, out_dir)
            futures.append(future)
    ok_models = []
    failed_models = []
    for future in futures:
        result = future.result()
        if result[0]:
            ok_models.append(result[1])
        else:
            failed_models.append(result[1])

    duration = round(time.time() - start_time, 1)
    manifest = {
        "info": {
            "mlflow_version": mlflow.__version__,
            "mlflow_tracking_uri": mlflow.get_tracking_uri(),
            "export_time": utils.get_now_nice(),
            "total_models": len(models),
            "ok_models": len(ok_models),
            "failed_models": len(failed_models),
            "duration": duration
        },
        "stages": stages,
        "notebook_formats": notebook_formats,
        "export_notebook_revision": export_notebook_revision,
        "ok_models": ok_models,
        "failed_models": failed_models
    }

    fs = _filesystem.get_filesystem(output_dir)
    fs.mkdirs(output_dir)
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        f.write(json.dumps(manifest, indent=2)+"\n")

    print(f"{len(models)} models exported")
    print(f"Duration for registered models export: {duration} seconds")
Example #5
 def _export_model(self, model_name, output_dir):
     fs = _filesystem.get_filesystem(output_dir)
     model = self.http_client.get(f"registered-models/get",
                                  {"name": model_name})
     fs.mkdirs(output_dir)
     model["registered_model"]["latest_versions"] = []
     versions = self.mlflow_client.search_model_versions(
         f"name='{model_name}'")
     print(f"Found {len(versions)} versions for model {model_name}")
     manifest = []
     exported_versions = 0
     for vr in versions:
         if len(self.stages) > 0 and vr.current_stage.lower() not in self.stages:
             continue
         run_id = vr.run_id
         opath = os.path.join(output_dir, run_id)
         opath = opath.replace("dbfs:", "/dbfs")
         dct = {
             "version": vr.version,
             "stage": vr.current_stage,
             "run_id": run_id
         }
         print(f"Exporting: {dct}")
         manifest.append(dct)
         try:
             if self.export_run:
                 self.run_exporter.export_run(run_id, opath)
             run = self.mlflow_client.get_run(run_id)
             dct = dict(vr)
             dct["_run_artifact_uri"] = run.info.artifact_uri
             experiment = mlflow.get_experiment(run.info.experiment_id)
             dct["_experiment_name"] = experiment.name
             model["registered_model"]["latest_versions"].append(dct)
             exported_versions += 1
         except mlflow.exceptions.RestException as e:
             if "RESOURCE_DOES_NOT_EXIST: Run" in str(e):
                 print(f"WARNING: Run for version {vr.version} does not exist. {e}")
             else:
                 import traceback
                 traceback.print_exc()
     print(f"Exported {exported_versions}/{len(versions)} versions for model {model_name}")
     path = os.path.join(output_dir, "model.json")
     utils.write_json_file(fs, path, model)
     return manifest
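Note that the stage filter lowercases vr.current_stage before the membership test, so self.stages is expected to hold lowercase stage names. A small self-contained illustration of that comparison:

    # Illustration only; mirrors the membership test in the loop above.
    stages = ["production", "staging"]   # lowercase, as the comparison expects
    current_stage = "Production"         # MLflow reports stages in mixed case
    keep = len(stages) == 0 or current_stage.lower() in stages
    print(keep)   # True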
Example #6
    def export_experiment(self, exp_id_or_name, output_dir, run_ids=None):
        """
        :param exp_id_or_name: Experiment ID or name.
        :param output_dir: Output directory.
        :param run_ids: List of run IDs to export. If None export all run IDs.
        :return: Number of successful and number of failed runs.
        """
        exp = mlflow_utils.get_experiment(self.mlflow_client, exp_id_or_name)
        exp_id = exp.experiment_id
        print(f"Exporting experiment '{exp.name}' (ID {exp.experiment_id}) to '{output_dir}'")
        fs = _filesystem.get_filesystem(output_dir)
        print("Filesystem:", type(fs).__name__)
        fs.mkdirs(output_dir)
        exp = self.mlflow_client.get_experiment(exp_id)
        dct = {"experiment": utils.strip_underscores(exp)}
        ok_run_ids = []
        failed_run_ids = []
        j = -1
        if run_ids:
            for j, run_id in enumerate(run_ids):
                run = self.mlflow_client.get_run(run_id)
                self._export_run(j, run, output_dir, ok_run_ids,
                                 failed_run_ids)
        else:
            for j, run in enumerate(
                    SearchRunsIterator(self.mlflow_client, exp_id)):
                self._export_run(j, run, output_dir, ok_run_ids,
                                 failed_run_ids)
        dct["export_info"] = {
            "export_time": utils.get_now_nice(),
            "num_total_runs": (j + 1),
            "num_ok_runs": len(ok_run_ids),
            "ok_runs": ok_run_ids,
            "num_failed_runs": len(failed_run_ids),
            "failed_runs": failed_run_ids
        }

        path = os.path.join(output_dir, "manifest.json")
        utils.write_json_file(fs, path, dct)
        msg = f"for experiment '{exp.name}' (ID: {exp.experiment_id})"
        if len(failed_run_ids) == 0:
            print(f"All {len(ok_run_ids)} runs successfully exported {msg}")
        else:
            print(f"{len(ok_run_ids)}/{j+1} runs successfully exported {msg}")
            print(f"{len(failed_run_ids)}/{j+1} runs failed {msg}")
        return len(ok_run_ids), len(failed_run_ids)
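A hypothetical call; the class name ExperimentExporter is taken from example #9 below, and the experiment name and output directory are placeholders:

    # Hypothetical usage of export_experiment; constructor arguments follow example #2,
    # which is assumed to be the experiment exporter's __init__.
    import mlflow
    exporter = ExperimentExporter(client=mlflow.tracking.MlflowClient())
    ok_runs, failed_runs = exporter.export_experiment("Sklearn_Wine", "out/sklearn_wine")
    print(f"{ok_runs} runs exported, {failed_runs} runs failed")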
Example #7
def import_all(input_dir, delete_model, use_src_user_id, import_mlflow_tags,
               import_metadata_tags, verbose, use_threads):
    start_time = time.time()
    exp_res = import_experiments(input_dir, use_src_user_id,
                                 import_mlflow_tags, import_metadata_tags)
    run_info_map = _remap(exp_res[0])
    model_res = import_models(input_dir, run_info_map, delete_model, verbose,
                              use_threads)
    duration = round(time.time() - start_time, 1)
    dct = {
        "duration": duration,
        "experiment_import": exp_res[1],
        "model_import": model_res
    }
    fs = _filesystem.get_filesystem(".")
    utils.write_json_file(fs, "import_report.json", dct)
    print("\nImport report:")
    print(json.dumps(dct, indent=2) + "\n")
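An illustrative invocation, with parameters in the order the signature above declares them (the input directory name is just a placeholder):

    # Hypothetical call; "mlflow_export" is an example directory produced by a prior export.
    import_all("mlflow_export",
               delete_model=False,
               use_src_user_id=False,
               import_mlflow_tags=True,
               import_metadata_tags=False,
               verbose=False,
               use_threads=False)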
Example #8
    def export_run(self, run_id, output_dir):
        """
        :param run_id: Run ID.
        :param output_dir: Output directory.
        :return: whether export succeeded.
        """
        fs = _filesystem.get_filesystem(output_dir)
        run = self.mlflow_client.get_run(run_id)
        fs.mkdirs(output_dir)
        tags = utils.create_tags_for_metadata(self.mlflow_client, run, self.export_metadata_tags)
        dct = {
            "info": utils.strip_underscores(run.info),
            "params": run.data.params,
            "metrics": run.data.metrics,
            "tags": tags,
        }
        path = os.path.join(output_dir, "run.json")
        utils.write_json_file(fs, path, dct)

        # copy artifacts
        dst_path = os.path.join(output_dir,"artifacts")
        try:
            TAG_NOTEBOOK_PATH = "mlflow.databricks.notebookPath"
            artifacts = self.mlflow_client.list_artifacts(run.info.run_id)
            if len(artifacts) > 0: # Because of https://github.com/mlflow/mlflow/issues/2839
                fs.mkdirs(dst_path)
                self.mlflow_client.download_artifacts(run.info.run_id,"", dst_path=mk_local_path(dst_path))
            notebook = tags.get(TAG_NOTEBOOK_PATH, None)
            if notebook is not None:
                if len(self.notebook_formats) > 0:
                    self.export_notebook(output_dir, notebook, run.data.tags, fs)
            elif len(self.notebook_formats) > 0:
                print(f"WARNING: Cannot export notebook since tag '{TAG_NOTEBOOK_PATH}' is not set.")
            return True
        except Exception as e:
            print("ERROR: run_id:",run.info.run_id,"Exception:",e)
            traceback.print_exc()
            return False
Example #9
def export_experiments(experiments,
                       output_dir,
                       export_metadata_tags,
                       notebook_formats,
                       export_notebook_revision=False,
                       use_threads=False):
    """
    :param experiments: Can be either:
      - List of experiment names 
      - List of experiment IDs
      - Dictionary with experiment ID key and list of run IDs 
      - String with comma-delimited experiment names or IDs.
    """
    start_time = time.time()
    max_workers = (os.cpu_count() or 4) if use_threads else 1  # parentheses so use_threads=False really means one worker

    export_all_runs = not isinstance(experiments, dict)
    if export_all_runs:
        experiments = utils.get_experiments(experiments)
        table_data = experiments
        columns = ["Experiment ID"]
        experiments_dct = {}
    else:
        experiments_dct = experiments
        experiments = experiments.keys()
        experiments = utils.get_experiments(experiments)
        table_data = [[exp_id, len(runs)]
                      for exp_id, runs in experiments_dct.items()]
        num_runs = sum(x[1] for x in table_data)
        table_data.append(["Total", num_runs])
        columns = ["Experiment ID", "# Runs"]
    utils.show_table("Experiments", table_data, columns)
    print("")

    ok_runs = 0
    failed_runs = 0
    export_results = []
    futures = []
    # `client` is a module-level MlflowClient in the enclosing module (not shown in this snippet).
    exporter = ExperimentExporter(client, export_metadata_tags,
                                  utils.string_to_list(notebook_formats),
                                  export_notebook_revision)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for exp_id_or_name in experiments:
            run_ids = experiments_dct.get(exp_id_or_name, None)
            future = executor.submit(_export_experiment, exp_id_or_name,
                                     output_dir, exporter, export_results,
                                     run_ids)
            futures.append(future)
    duration = round(time.time() - start_time, 1)
    ok_runs = 0
    failed_runs = 0
    for future in futures:
        result = future.result()
        ok_runs += result[0]
        failed_runs += result[1]

    total_runs = ok_runs + failed_runs
    duration = round(time.time() - start_time, 1)
    dct = {
        "info": {
            "mlflow_version": mlflow.__version__,
            "mlflow_tracking_uri": mlflow.get_tracking_uri(),
            "export_time": utils.get_now_nice(),
            "duration": duration,
            "experiments": len(experiments),
            "total_runs": total_runs,
            "ok_runs": ok_runs,
            "failed_runs": failed_runs
        },
        "experiments": export_results
    }
    fs = _filesystem.get_filesystem(output_dir)
    fs.mkdirs(output_dir)
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        f.write(json.dumps(dct, indent=2) + "\n")

    print(f"{len(experiments)} experiments exported")
    print(f"{ok_runs}/{total_runs} runs succesfully exported")
    if failed_runs > 0:
        print(f"{failed_runs}/{total_runs} runs failed")
    print(f"Duration for experiments export: {duration} seconds")
Example #10
 def __init__(self, filesystem=None):
     self.fs = filesystem or _filesystem.get_filesystem()
     self.client = mlflow.tracking.MlflowClient()
     self.run_importer = RunImporter(self.client, mlmodel_fix=True)
Example #11
 def __init__(self, filesystem=None, run_importer=None):
     self.fs = filesystem or _filesystem.get_filesystem()
     self.client = mlflow.tracking.MlflowClient()
     self.run_importer = run_importer or RunImporter(
         self.client, mlmodel_fix=True, import_mlflow_tags=True)
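A hedged usage sketch for the injectable run_importer; the enclosing class name ExperimentImporter is an assumption, since the snippet only shows __init__:

    # Hypothetical: inject a preconfigured RunImporter instead of the default one.
    import mlflow
    client = mlflow.tracking.MlflowClient()
    importer = ExperimentImporter(run_importer=RunImporter(client, mlmodel_fix=False, import_mlflow_tags=False))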