Exemplo n.º 1
0
    def set_terminated(self, run_id, status=None, end_time=None):
        """Set a run's status to terminated.

        :param status: A string value of :py:class:`mlflow.entities.RunStatus`.
                       Defaults to "FINISHED".
        :param end_time: If not provided, defaults to the current time."""
        end_time = end_time if end_time else int(time.time() * 1000)
        status = status if status else RunStatus.to_string(RunStatus.FINISHED)
        self.store.update_run_info(run_id,
                                   run_status=RunStatus.from_string(status),
                                   end_time=end_time)
Exemplo n.º 2
0
    def create_run(self, experiment_id, user_id, start_time, tags):
        with self.ManagedSessionMaker() as session:
            experiment = self.get_experiment(experiment_id)
            self._check_experiment_is_active(experiment)

            run_id = uuid.uuid4().hex
            artifact_location = append_to_uri_path(
                experiment.artifact_location, run_id,
                SqlAlchemyStore.ARTIFACTS_FOLDER_NAME)
            run = SqlRun(name="",
                         artifact_uri=artifact_location,
                         run_uuid=run_id,
                         experiment_id=experiment_id,
                         source_type=SourceType.to_string(SourceType.UNKNOWN),
                         source_name="",
                         entry_point_name="",
                         user_id=user_id,
                         status=RunStatus.to_string(RunStatus.RUNNING),
                         start_time=start_time,
                         end_time=None,
                         source_version="",
                         lifecycle_stage=LifecycleStage.ACTIVE)

            tags_dict = {}
            for tag in tags:
                tags_dict[tag.key] = tag.value
            run.tags = [
                SqlTag(key=key, value=value)
                for key, value in tags_dict.items()
            ]
            self._save_to_db(objs=run, session=session)

            return run.to_mlflow_entity()
Exemplo n.º 3
0
def end_run(status=RunStatus.to_string(RunStatus.FINISHED)):
    """End an active MLflow run (if there is one)."""
    global _active_run_stack
    if len(_active_run_stack) > 0:
        # Clear out the global existing run environment variable as well.
        env.unset_variable(_RUN_ID_ENV_VAR)
        run = _active_run_stack.pop()
        MlflowClient().set_terminated(run.info.run_id, status)
Exemplo n.º 4
0
def test_start_run_context_manager():
    with start_run() as first_run:
        first_uuid = first_run.info.run_id
        # Check that start_run() causes the run information to be persisted in the store
        persisted_run = tracking.MlflowClient().get_run(first_uuid)
        assert persisted_run is not None
        assert persisted_run.info == first_run.info
    finished_run = tracking.MlflowClient().get_run(first_uuid)
    assert finished_run.info.status == RunStatus.to_string(RunStatus.FINISHED)
    # Launch a separate run that fails, verify the run status is FAILED and the run UUID is
    # different
    with pytest.raises(Exception):
        with start_run() as second_run:
            second_run_id = second_run.info.run_id
            raise Exception("Failing run!")
    assert second_run_id != first_uuid
    finished_run2 = tracking.MlflowClient().get_run(second_run_id)
    assert finished_run2.info.status == RunStatus.to_string(RunStatus.FAILED)
Exemplo n.º 5
0
 def test_all_status_covered(self):
     # ensure that all known status are returned. Test will fail if new status are added to PB
     all_statuses = set([
         RunStatus.RUNNING,
         RunStatus.SCHEDULED,
         RunStatus.FINISHED,
         RunStatus.FAILED,
         RunStatus.KILLED,
     ])
     self.assertSequenceEqual(all_statuses, set(RunStatus.all_status()))
Exemplo n.º 6
0
 def _create():
     run_id = str(uuid.uuid4())
     experiment_id = str(random_int(10, 2000))
     user_id = random_str(random_int(10, 25))
     status = RunStatus.to_string(random.choice(RunStatus.all_status()))
     start_time = random_int(1, 10)
     end_time = start_time + random_int(1, 10)
     lifecycle_stage = LifecycleStage.ACTIVE
     artifact_uri = random_str(random_int(10, 40))
     ri = RunInfo(run_uuid=run_id,
                  run_id=run_id,
                  experiment_id=experiment_id,
                  user_id=user_id,
                  status=status,
                  start_time=start_time,
                  end_time=end_time,
                  lifecycle_stage=lifecycle_stage,
                  artifact_uri=artifact_uri)
     return (ri, run_id, experiment_id, user_id, status, start_time,
             end_time, lifecycle_stage, artifact_uri)
Exemplo n.º 7
0
    def update_run_info(self, run_id, run_status, end_time):
        with self.ManagedSessionMaker() as session:
            run = self._get_run(run_uuid=run_id, session=session)
            self._check_run_is_active(run)
            run.status = RunStatus.to_string(run_status)
            run.end_time = end_time

            self._save_to_db(objs=run, session=session)
            run = run.to_mlflow_entity()

            return run.info
Exemplo n.º 8
0
def _maybe_set_run_terminated(active_run, status):
    """
    If the passed-in active run is defined and still running (i.e. hasn't already been terminated
    within user code), mark it as terminated with the passed-in status.
    """
    if active_run is None:
        return
    run_id = active_run.info.run_id
    cur_status = tracking.MlflowClient().get_run(run_id).info.status
    if RunStatus.is_terminated(cur_status):
        return
    tracking.MlflowClient().set_terminated(run_id, status)
Exemplo n.º 9
0
def test_run_local_git_repo(local_git_repo, local_git_repo_uri, use_start_run,
                            version):
    if version is not None:
        uri = local_git_repo_uri + "#" + TEST_PROJECT_NAME
    else:
        uri = os.path.join("%s/" % local_git_repo, TEST_PROJECT_NAME)
    if version == "git-commit":
        version = _get_version_local_git_repo(local_git_repo)
    submitted_run = kiwi.projects.run(
        uri,
        entry_point="test_tracking",
        version=version,
        parameters={"use_start_run": use_start_run},
        use_conda=False,
        experiment_id=FileStore.DEFAULT_EXPERIMENT_ID)

    # Blocking runs should be finished when they return
    validate_exit_status(submitted_run.get_status(), RunStatus.FINISHED)
    # Test that we can call wait() on a synchronous run & that the run has the correct
    # status after calling wait().
    submitted_run.wait()
    validate_exit_status(submitted_run.get_status(), RunStatus.FINISHED)
    # Validate run contents in the FileStore
    run_id = submitted_run.run_id
    mlflow_service = kiwi.tracking.MlflowClient()
    run_infos = mlflow_service.list_run_infos(
        experiment_id=FileStore.DEFAULT_EXPERIMENT_ID,
        run_view_type=ViewType.ACTIVE_ONLY)
    assert len(run_infos) == 1
    store_run_id = run_infos[0].run_id
    assert run_id == store_run_id
    run = mlflow_service.get_run(run_id)

    assert run.info.status == RunStatus.to_string(RunStatus.FINISHED)
    assert run.data.params == {"use_start_run": use_start_run}
    assert run.data.metrics == {"some_key": 3}

    tags = run.data.tags
    assert tags[MLFLOW_USER] == MOCK_USER
    assert "file:" in tags[MLFLOW_SOURCE_NAME]
    assert tags[MLFLOW_SOURCE_TYPE] == SourceType.to_string(SourceType.PROJECT)
    assert tags[MLFLOW_PROJECT_ENTRY_POINT] == "test_tracking"
    assert tags[MLFLOW_PROJECT_BACKEND] == "local"

    if version == "master":
        assert tags[MLFLOW_GIT_BRANCH] == "master"
        assert tags[MLFLOW_GIT_REPO_URL] == local_git_repo_uri
        assert tags[LEGACY_MLFLOW_GIT_BRANCH_NAME] == "master"
        assert tags[LEGACY_MLFLOW_GIT_REPO_URL] == local_git_repo_uri
Exemplo n.º 10
0
 def cancel(self):
     with self._status_lock:
         if not RunStatus.is_terminated(self._status):
             _logger.info("Cancelling job.")
             kube_api = kubernetes.client.BatchV1Api()
             kube_api.delete_namespaced_job(
                 name=self._job_name,
                 namespace=self._job_namespace,
                 body=kubernetes.client.V1DeleteOptions(),
                 pretty=True)
             self._status = RunStatus.KILLED
             _logger.info("Job cancelled.")
         else:
             _logger.info(
                 "Attempting to cancel a job that is already terminated.")
Exemplo n.º 11
0
def _read_persisted_run_info_dict(run_info_dict):
    dict_copy = run_info_dict.copy()
    if 'lifecycle_stage' not in dict_copy:
        dict_copy['lifecycle_stage'] = LifecycleStage.ACTIVE
    # 'status' is stored as an integer enum in meta file, but RunInfo.status field is a string.
    # converting to string before hydrating RunInfo.
    # If 'status' value not recorded in files, mark it as 'RUNNING' (default)
    dict_copy['status'] = RunStatus.to_string(
        run_info_dict.get('status', RunStatus.RUNNING))

    # 'experiment_id' was changed from int to string, so we must cast to string
    # when reading legacy run_infos
    if isinstance(dict_copy["experiment_id"], int):
        dict_copy["experiment_id"] = str(dict_copy["experiment_id"])
    return RunInfo.from_dictionary(dict_copy)
Exemplo n.º 12
0
def _make_persisted_run_info_dict(run_info):
    # 'tags' was moved from RunInfo to RunData, so we must keep storing it in the meta.yaml for
    # old mlflow versions to read
    run_info_dict = dict(run_info)
    run_info_dict['tags'] = []
    run_info_dict['name'] = ''
    if 'status' in run_info_dict:
        # 'status' is stored as an integer enum in meta file, but RunInfo.status field is a string.
        # Convert from string to enum/int before storing.
        run_info_dict['status'] = RunStatus.from_string(run_info.status)
    else:
        run_info_dict['status'] = RunStatus.RUNNING
    run_info_dict['source_type'] = SourceType.LOCAL
    run_info_dict['source_name'] = ''
    run_info_dict['entry_point_name'] = ''
    run_info_dict['source_version'] = ''
    return run_info_dict
Exemplo n.º 13
0
 def _update_status(self, kube_api=kubernetes.client.BatchV1Api()):
     api_response = kube_api.read_namespaced_job_status(
         name=self._job_name, namespace=self._job_namespace, pretty=True)
     status = api_response.status
     with self._status_lock:
         if RunStatus.is_terminated(self._status):
             return self._status
         if self._status == RunStatus.SCHEDULED:
             if api_response.status.start_time is None:
                 _logger.info("Waiting for Job to start")
             else:
                 _logger.info("Job started.")
                 self._status = RunStatus.RUNNING
         if status.conditions is not None:
             for condition in status.conditions:
                 if condition.status == "True":
                     _logger.info(condition.message)
                     if condition.type == "Failed":
                         self._status = RunStatus.FAILED
                     elif condition.type == "Complete":
                         self._status = RunStatus.FINISHED
     return self._status
Exemplo n.º 14
0
 def create_run(self, experiment_id, user_id, start_time, tags):
     """
     Creates a run with the specified attributes.
     """
     experiment_id = FileStore.DEFAULT_EXPERIMENT_ID if experiment_id is None else experiment_id
     experiment = self.get_experiment(experiment_id)
     if experiment is None:
         raise MlflowException(
             "Could not create run under experiment with ID %s - no such experiment "
             "exists." % experiment_id,
             databricks_pb2.RESOURCE_DOES_NOT_EXIST)
     if experiment.lifecycle_stage != LifecycleStage.ACTIVE:
         raise MlflowException(
             "Could not create run under non-active experiment with ID "
             "%s." % experiment_id, databricks_pb2.INVALID_STATE)
     run_uuid = uuid.uuid4().hex
     artifact_uri = self._get_artifact_dir(experiment_id, run_uuid)
     run_info = RunInfo(run_uuid=run_uuid,
                        run_id=run_uuid,
                        experiment_id=experiment_id,
                        artifact_uri=artifact_uri,
                        user_id=user_id,
                        status=RunStatus.to_string(RunStatus.RUNNING),
                        start_time=start_time,
                        end_time=None,
                        lifecycle_stage=LifecycleStage.ACTIVE)
     # Persist run metadata and create directories for logging metrics, parameters, artifacts
     run_dir = self._get_run_dir(run_info.experiment_id, run_info.run_id)
     mkdir(run_dir)
     run_info_dict = _make_persisted_run_info_dict(run_info)
     write_yaml(run_dir, FileStore.META_DATA_FILE_NAME, run_info_dict)
     mkdir(run_dir, FileStore.METRICS_FOLDER_NAME)
     mkdir(run_dir, FileStore.PARAMS_FOLDER_NAME)
     mkdir(run_dir, FileStore.ARTIFACTS_FOLDER_NAME)
     for tag in tags:
         self.set_tag(run_uuid, tag)
     return self.get_run(run_id=run_uuid)
Exemplo n.º 15
0
def test_run(use_start_run):
    submitted_run = kiwi.projects.run(
        TEST_PROJECT_DIR,
        entry_point="test_tracking",
        parameters={"use_start_run": use_start_run},
        use_conda=False,
        experiment_id=FileStore.DEFAULT_EXPERIMENT_ID)
    assert submitted_run.run_id is not None
    # Blocking runs should be finished when they return
    validate_exit_status(submitted_run.get_status(), RunStatus.FINISHED)
    # Test that we can call wait() on a synchronous run & that the run has the correct
    # status after calling wait().
    submitted_run.wait()
    validate_exit_status(submitted_run.get_status(), RunStatus.FINISHED)
    # Validate run contents in the FileStore
    run_id = submitted_run.run_id
    mlflow_service = kiwi.tracking.MlflowClient()

    run_infos = mlflow_service.list_run_infos(
        experiment_id=FileStore.DEFAULT_EXPERIMENT_ID,
        run_view_type=ViewType.ACTIVE_ONLY)
    assert len(run_infos) == 1
    store_run_id = run_infos[0].run_id
    assert run_id == store_run_id
    run = mlflow_service.get_run(run_id)

    assert run.info.status == RunStatus.to_string(RunStatus.FINISHED)

    assert run.data.params == {"use_start_run": use_start_run}
    assert run.data.metrics == {"some_key": 3}

    tags = run.data.tags
    assert tags[MLFLOW_USER] == MOCK_USER
    assert "file:" in tags[MLFLOW_SOURCE_NAME]
    assert tags[MLFLOW_SOURCE_TYPE] == SourceType.to_string(SourceType.PROJECT)
    assert tags[MLFLOW_PROJECT_ENTRY_POINT] == "test_tracking"
Exemplo n.º 16
0
 def __exit__(self, exc_type, exc_val, exc_tb):
     status = RunStatus.FINISHED if exc_type is None else RunStatus.FAILED
     end_run(RunStatus.to_string(status))
     return exc_type is None
Exemplo n.º 17
0
def validate_exit_status(status_str, expected):
    assert RunStatus.from_string(status_str) == expected
Exemplo n.º 18
0
class SqlRun(Base):
    """
    DB model for :py:class:`mlflow.entities.Run`. These are recorded in ``runs`` table.
    """
    __tablename__ = 'runs'

    run_uuid = Column(String(32), nullable=False)
    """
    Run UUID: `String` (limit 32 characters). *Primary Key* for ``runs`` table.
    """
    name = Column(String(250))
    """
    Run name: `String` (limit 250 characters).
    """
    source_type = Column(String(20),
                         default=SourceType.to_string(SourceType.LOCAL))
    """
    Source Type: `String` (limit 20 characters). Can be one of ``NOTEBOOK``, ``JOB``, ``PROJECT``,
                 ``LOCAL`` (default), or ``UNKNOWN``.
    """
    source_name = Column(String(500))
    """
    Name of source recording the run: `String` (limit 500 characters).
    """
    entry_point_name = Column(String(50))
    """
    Entry-point name that launched the run run: `String` (limit 50 characters).
    """
    user_id = Column(String(256), nullable=True, default=None)
    """
    User ID: `String` (limit 256 characters). Defaults to ``null``.
    """
    status = Column(String(20),
                    default=RunStatus.to_string(RunStatus.SCHEDULED))
    """
    Run Status: `String` (limit 20 characters). Can be one of ``RUNNING``, ``SCHEDULED`` (default),
                ``FINISHED``, ``FAILED``.
    """
    start_time = Column(BigInteger, default=int(time.time()))
    """
    Run start time: `BigInteger`. Defaults to current system time.
    """
    end_time = Column(BigInteger, nullable=True, default=None)
    """
    Run end time: `BigInteger`.
    """
    source_version = Column(String(50))
    """
    Source version: `String` (limit 50 characters).
    """
    lifecycle_stage = Column(String(20), default=LifecycleStage.ACTIVE)
    """
    Lifecycle Stage of run: `String` (limit 32 characters).
                            Can be either ``active`` (default) or ``deleted``.
    """
    artifact_uri = Column(String(200), default=None)
    """
    Default artifact location for this run: `String` (limit 200 characters).
    """
    experiment_id = Column(Integer, ForeignKey('experiments.experiment_id'))
    """
    Experiment ID to which this run belongs to: *Foreign Key* into ``experiment`` table.
    """
    experiment = relationship('SqlExperiment',
                              backref=backref('runs', cascade='all'))
    """
    SQLAlchemy relationship (many:one) with :py:class:`mlflow.store.dbmodels.models.SqlExperiment`.
    """

    __table_args__ = (CheckConstraint(source_type.in_(SourceTypes),
                                      name='source_type'),
                      CheckConstraint(status.in_(RunStatusTypes),
                                      name='status'),
                      CheckConstraint(lifecycle_stage.in_(
                          LifecycleStage.view_type_to_stages(ViewType.ALL)),
                                      name='runs_lifecycle_stage'),
                      PrimaryKeyConstraint('run_uuid', name='run_pk'))

    @staticmethod
    def get_attribute_name(mlflow_attribute_name):
        """
        Resolves an MLflow attribute name to a `SqlRun` attribute name.
        """
        # Currently, MLflow Search attributes defined in `SearchUtils.VALID_SEARCH_ATTRIBUTE_KEYS`
        # share the same names as their corresponding `SqlRun` attributes. Therefore, this function
        # returns the same attribute name
        return mlflow_attribute_name

    def to_mlflow_entity(self):
        """
        Convert DB model to corresponding MLflow entity.

        :return: :py:class:`mlflow.entities.Run`.
        """
        run_info = RunInfo(run_uuid=self.run_uuid,
                           run_id=self.run_uuid,
                           experiment_id=str(self.experiment_id),
                           user_id=self.user_id,
                           status=self.status,
                           start_time=self.start_time,
                           end_time=self.end_time,
                           lifecycle_stage=self.lifecycle_stage,
                           artifact_uri=self.artifact_uri)

        run_data = RunData(
            metrics=[m.to_mlflow_entity() for m in self.latest_metrics],
            params=[p.to_mlflow_entity() for p in self.params],
            tags=[t.to_mlflow_entity() for t in self.tags])

        return Run(run_info=run_info, run_data=run_data)
Exemplo n.º 19
0
 def test_is_terminated(self):
     self.assertTrue(RunStatus.is_terminated(RunStatus.FAILED))
     self.assertTrue(RunStatus.is_terminated(RunStatus.FINISHED))
     self.assertTrue(RunStatus.is_terminated(RunStatus.KILLED))
     self.assertFalse(RunStatus.is_terminated(RunStatus.SCHEDULED))
     self.assertFalse(RunStatus.is_terminated(RunStatus.RUNNING))
Exemplo n.º 20
0
    def test_status_mappings(self):
        # test enum to string mappings
        self.assertEqual("RUNNING", RunStatus.to_string(RunStatus.RUNNING))
        self.assertEqual(RunStatus.RUNNING, RunStatus.from_string("RUNNING"))

        self.assertEqual("SCHEDULED", RunStatus.to_string(RunStatus.SCHEDULED))
        self.assertEqual(RunStatus.SCHEDULED,
                         RunStatus.from_string("SCHEDULED"))

        self.assertEqual("FINISHED", RunStatus.to_string(RunStatus.FINISHED))
        self.assertEqual(RunStatus.FINISHED, RunStatus.from_string("FINISHED"))

        self.assertEqual("FAILED", RunStatus.to_string(RunStatus.FAILED))
        self.assertEqual(RunStatus.FAILED, RunStatus.from_string("FAILED"))

        self.assertEqual("KILLED", RunStatus.to_string(RunStatus.KILLED))
        self.assertEqual(RunStatus.KILLED, RunStatus.from_string("KILLED"))

        with self.assertRaises(Exception) as e:
            RunStatus.to_string(-120)
        self.assertIn("Could not get string corresponding to run status -120",
                      str(e.exception))

        with self.assertRaises(Exception) as e:
            RunStatus.from_string("the IMPOSSIBLE status string")
        self.assertIn(
            "Could not get run status corresponding to string the IMPO",
            str(e.exception))
Exemplo n.º 21
0
    def wait(self):
        kube_api = kubernetes.client.BatchV1Api()
        while not RunStatus.is_terminated(self._update_status(kube_api)):
            time.sleep(self.POLL_STATUS_INTERVAL)

        return self._status == RunStatus.FINISHED
Exemplo n.º 22
0
                        BigInteger, PrimaryKeyConstraint, Boolean)
from kiwi.entities import (Experiment, RunTag, Metric, Param, RunData, RunInfo,
                           SourceType, RunStatus, Run, ViewType, ExperimentTag)
from kiwi.entities.lifecycle_stage import LifecycleStage
from kiwi.store.db.base_sql_model import Base

SourceTypes = [
    SourceType.to_string(SourceType.NOTEBOOK),
    SourceType.to_string(SourceType.JOB),
    SourceType.to_string(SourceType.LOCAL),
    SourceType.to_string(SourceType.UNKNOWN),
    SourceType.to_string(SourceType.PROJECT)
]

RunStatusTypes = [
    RunStatus.to_string(RunStatus.SCHEDULED),
    RunStatus.to_string(RunStatus.FAILED),
    RunStatus.to_string(RunStatus.FINISHED),
    RunStatus.to_string(RunStatus.RUNNING),
    RunStatus.to_string(RunStatus.KILLED)
]


class SqlExperiment(Base):
    """
    DB model for :py:class:`mlflow.entities.Experiment`. These are recorded in ``experiment`` table.
    """
    __tablename__ = 'experiments'

    experiment_id = Column(Integer, autoincrement=True)
    """
Exemplo n.º 23
0
 def get_status(self):
     status = self._status
     return status if RunStatus.is_terminated(
         status) else self._update_status()
Exemplo n.º 24
0
 def get_status(self):
     return RunStatus.to_string(self._get_status())
Exemplo n.º 25
0
 def get_status(self, databricks_run_id):
     return RunStatus.to_string(self._get_status(databricks_run_id))