def set_terminated(self, run_id, status=None, end_time=None):
    """Set a run's status to terminated.

    :param status: A string value of :py:class:`mlflow.entities.RunStatus`.
                   Defaults to "FINISHED".
    :param end_time: If not provided, defaults to the current time."""
    if not end_time:
        end_time = int(time.time() * 1000)
    if not status:
        status = RunStatus.to_string(RunStatus.FINISHED)
    self.store.update_run_info(
        run_id, run_status=RunStatus.from_string(status), end_time=end_time)
def create_run(self, experiment_id, user_id, start_time, tags):
    """Create a new RUNNING run under the given experiment and persist it to the DB."""
    with self.ManagedSessionMaker() as session:
        experiment = self.get_experiment(experiment_id)
        self._check_experiment_is_active(experiment)
        run_id = uuid.uuid4().hex
        artifact_location = append_to_uri_path(
            experiment.artifact_location, run_id,
            SqlAlchemyStore.ARTIFACTS_FOLDER_NAME)
        run = SqlRun(
            name="",
            artifact_uri=artifact_location,
            run_uuid=run_id,
            experiment_id=experiment_id,
            source_type=SourceType.to_string(SourceType.UNKNOWN),
            source_name="",
            entry_point_name="",
            user_id=user_id,
            status=RunStatus.to_string(RunStatus.RUNNING),
            start_time=start_time,
            end_time=None,
            source_version="",
            lifecycle_stage=LifecycleStage.ACTIVE,
        )
        # Deduplicate by key; a later tag with the same key wins, matching the
        # original incremental dict build.
        deduped_tags = {tag.key: tag.value for tag in tags}
        run.tags = [SqlTag(key=k, value=v) for k, v in deduped_tags.items()]
        self._save_to_db(objs=run, session=session)
        return run.to_mlflow_entity()
def end_run(status=RunStatus.to_string(RunStatus.FINISHED)):
    """End an active MLflow run (if there is one)."""
    global _active_run_stack
    if not _active_run_stack:
        return
    # Clear out the global existing run environment variable as well.
    env.unset_variable(_RUN_ID_ENV_VAR)
    run = _active_run_stack.pop()
    MlflowClient().set_terminated(run.info.run_id, status)
def test_start_run_context_manager():
    """A run started via the context manager is persisted, FINISHED on clean exit,
    and FAILED when the body raises."""
    client = tracking.MlflowClient()
    with start_run() as first_run:
        first_uuid = first_run.info.run_id
        # Check that start_run() causes the run information to be persisted in the store
        persisted_run = client.get_run(first_uuid)
        assert persisted_run is not None
        assert persisted_run.info == first_run.info
    finished_run = client.get_run(first_uuid)
    assert finished_run.info.status == RunStatus.to_string(RunStatus.FINISHED)
    # Launch a separate run that fails, verify the run status is FAILED and the run UUID is
    # different
    with pytest.raises(Exception):
        with start_run() as second_run:
            second_run_id = second_run.info.run_id
            raise Exception("Failing run!")
    assert second_run_id != first_uuid
    finished_run2 = client.get_run(second_run_id)
    assert finished_run2.info.status == RunStatus.to_string(RunStatus.FAILED)
def test_all_status_covered(self):
    # Ensure that all known statuses are returned. The test will fail if new
    # statuses are added to the protobuf definition without updating this set.
    expected = {
        RunStatus.RUNNING,
        RunStatus.SCHEDULED,
        RunStatus.FINISHED,
        RunStatus.FAILED,
        RunStatus.KILLED,
    }
    self.assertSequenceEqual(expected, set(RunStatus.all_status()))
def _create():
    """Build a randomized RunInfo and return it along with each of its field values."""
    # Keyword arguments are evaluated left to right, so the RNG is consumed in the
    # same order as the original sequential assignments.
    fields = dict(
        run_id=str(uuid.uuid4()),
        experiment_id=str(random_int(10, 2000)),
        user_id=random_str(random_int(10, 25)),
        status=RunStatus.to_string(random.choice(RunStatus.all_status())),
        start_time=random_int(1, 10),
    )
    fields["end_time"] = fields["start_time"] + random_int(1, 10)
    fields["lifecycle_stage"] = LifecycleStage.ACTIVE
    fields["artifact_uri"] = random_str(random_int(10, 40))
    ri = RunInfo(run_uuid=fields["run_id"], **fields)
    return (ri, fields["run_id"], fields["experiment_id"], fields["user_id"],
            fields["status"], fields["start_time"], fields["end_time"],
            fields["lifecycle_stage"], fields["artifact_uri"])
def update_run_info(self, run_id, run_status, end_time):
    """Update an active run's status and end time; return the refreshed RunInfo."""
    with self.ManagedSessionMaker() as session:
        sql_run = self._get_run(run_uuid=run_id, session=session)
        self._check_run_is_active(sql_run)
        sql_run.status = RunStatus.to_string(run_status)
        sql_run.end_time = end_time
        self._save_to_db(objs=sql_run, session=session)
        return sql_run.to_mlflow_entity().info
def _maybe_set_run_terminated(active_run, status):
    """
    If the passed-in active run is defined and still running (i.e. hasn't already been
    terminated within user code), mark it as terminated with the passed-in status.
    """
    if active_run is None:
        return
    run_id = active_run.info.run_id
    current_status = tracking.MlflowClient().get_run(run_id).info.status
    if not RunStatus.is_terminated(current_status):
        tracking.MlflowClient().set_terminated(run_id, status)
def test_run_local_git_repo(local_git_repo, local_git_repo_uri, use_start_run, version):
    """
    Run the test project from a local git repo (by URI when a version is given,
    otherwise by filesystem path) and validate the resulting run's status, params,
    metrics, and tags as persisted in the FileStore.
    """
    if version is not None:
        uri = local_git_repo_uri + "#" + TEST_PROJECT_NAME
    else:
        uri = os.path.join("%s/" % local_git_repo, TEST_PROJECT_NAME)
    if version == "git-commit":
        # Resolve the sentinel "git-commit" to the repo's actual commit hash.
        version = _get_version_local_git_repo(local_git_repo)
    submitted_run = kiwi.projects.run(
        uri, entry_point="test_tracking", version=version,
        parameters={"use_start_run": use_start_run},
        use_conda=False, experiment_id=FileStore.DEFAULT_EXPERIMENT_ID)

    # Blocking runs should be finished when they return
    validate_exit_status(submitted_run.get_status(), RunStatus.FINISHED)
    # Test that we can call wait() on a synchronous run & that the run has the correct
    # status after calling wait().
    submitted_run.wait()
    validate_exit_status(submitted_run.get_status(), RunStatus.FINISHED)
    # Validate run contents in the FileStore
    run_id = submitted_run.run_id
    mlflow_service = kiwi.tracking.MlflowClient()
    run_infos = mlflow_service.list_run_infos(
        experiment_id=FileStore.DEFAULT_EXPERIMENT_ID,
        run_view_type=ViewType.ACTIVE_ONLY)
    # The store should contain exactly the run we just submitted.
    assert len(run_infos) == 1
    store_run_id = run_infos[0].run_id
    assert run_id == store_run_id
    run = mlflow_service.get_run(run_id)
    assert run.info.status == RunStatus.to_string(RunStatus.FINISHED)
    assert run.data.params == {"use_start_run": use_start_run}
    assert run.data.metrics == {"some_key": 3}
    tags = run.data.tags
    assert tags[MLFLOW_USER] == MOCK_USER
    assert "file:" in tags[MLFLOW_SOURCE_NAME]
    assert tags[MLFLOW_SOURCE_TYPE] == SourceType.to_string(SourceType.PROJECT)
    assert tags[MLFLOW_PROJECT_ENTRY_POINT] == "test_tracking"
    assert tags[MLFLOW_PROJECT_BACKEND] == "local"
    # Git tags (both current and legacy keys) are asserted only for branch-based runs.
    if version == "master":
        assert tags[MLFLOW_GIT_BRANCH] == "master"
        assert tags[MLFLOW_GIT_REPO_URL] == local_git_repo_uri
        assert tags[LEGACY_MLFLOW_GIT_BRANCH_NAME] == "master"
        assert tags[LEGACY_MLFLOW_GIT_REPO_URL] == local_git_repo_uri
def cancel(self):
    """Cancel the Kubernetes job backing this run, unless it has already terminated."""
    with self._status_lock:
        if RunStatus.is_terminated(self._status):
            _logger.info(
                "Attempting to cancel a job that is already terminated.")
            return
        _logger.info("Cancelling job.")
        kubernetes.client.BatchV1Api().delete_namespaced_job(
            name=self._job_name,
            namespace=self._job_namespace,
            body=kubernetes.client.V1DeleteOptions(),
            pretty=True)
        self._status = RunStatus.KILLED
        _logger.info("Job cancelled.")
def _read_persisted_run_info_dict(run_info_dict):
    """Hydrate a RunInfo entity from a dict read from the persisted meta file,
    normalizing legacy field formats along the way."""
    hydrated = run_info_dict.copy()
    hydrated.setdefault('lifecycle_stage', LifecycleStage.ACTIVE)
    # 'status' is stored as an integer enum in meta file, but RunInfo.status field is a
    # string. Convert to string before hydrating RunInfo; default to RUNNING when no
    # 'status' value was recorded in the file.
    hydrated['status'] = RunStatus.to_string(
        run_info_dict.get('status', RunStatus.RUNNING))
    # 'experiment_id' was changed from int to string, so legacy run_infos must be cast.
    if isinstance(hydrated["experiment_id"], int):
        hydrated["experiment_id"] = str(hydrated["experiment_id"])
    return RunInfo.from_dictionary(hydrated)
def _make_persisted_run_info_dict(run_info):
    """Serialize a RunInfo entity into the dict form stored in meta.yaml, including
    legacy fields kept for old mlflow versions."""
    # 'tags' was moved from RunInfo to RunData, so we must keep storing it in the
    # meta.yaml for old mlflow versions to read.
    persisted = dict(run_info)
    persisted['tags'] = []
    persisted['name'] = ''
    # 'status' is stored as an integer enum in meta file, but RunInfo.status field is
    # a string; convert from string to enum/int before storing, defaulting to RUNNING
    # when the field is absent.
    persisted['status'] = (RunStatus.from_string(run_info.status)
                           if 'status' in persisted else RunStatus.RUNNING)
    # Legacy source fields retained for backwards compatibility.
    for legacy_key, legacy_value in (('source_type', SourceType.LOCAL),
                                     ('source_name', ''),
                                     ('entry_point_name', ''),
                                     ('source_version', '')):
        persisted[legacy_key] = legacy_value
    return persisted
def _update_status(self, kube_api=None):
    """Poll the Kubernetes job's status and update this run's cached RunStatus.

    :param kube_api: Optional ``kubernetes.client.BatchV1Api`` instance to use for
                     the status query. If not provided, a client is created for
                     this call. (Previously the default was
                     ``kube_api=kubernetes.client.BatchV1Api()``, which Python
                     evaluates once at function-definition time — creating the
                     API client at import time and silently sharing one instance
                     across every call; the ``None`` sentinel fixes that while
                     remaining backward compatible for callers that pass a client.)
    :return: The current RunStatus of the job.
    """
    if kube_api is None:
        kube_api = kubernetes.client.BatchV1Api()
    api_response = kube_api.read_namespaced_job_status(
        name=self._job_name, namespace=self._job_namespace, pretty=True)
    status = api_response.status
    with self._status_lock:
        # Terminal statuses are final: never let a stale poll overwrite them.
        if RunStatus.is_terminated(self._status):
            return self._status
        if self._status == RunStatus.SCHEDULED:
            if status.start_time is None:
                _logger.info("Waiting for Job to start")
            else:
                _logger.info("Job started.")
                self._status = RunStatus.RUNNING
        if status.conditions is not None:
            for condition in status.conditions:
                if condition.status == "True":
                    _logger.info(condition.message)
                    if condition.type == "Failed":
                        self._status = RunStatus.FAILED
                    elif condition.type == "Complete":
                        self._status = RunStatus.FINISHED
    return self._status
def create_run(self, experiment_id, user_id, start_time, tags):
    """
    Creates a run with the specified attributes.

    :param experiment_id: ID of the experiment to create the run under; falls back to
                          ``FileStore.DEFAULT_EXPERIMENT_ID`` when ``None``.
    :param user_id: ID of the user launching the run.
    :param start_time: Run start time (presumably milliseconds since epoch — confirm
                       against callers).
    :param tags: Iterable of tag entities, each persisted via ``self.set_tag``.
    :return: The newly created run entity, re-read from the store.
    :raises MlflowException: if the experiment does not exist or is not active.
    """
    experiment_id = FileStore.DEFAULT_EXPERIMENT_ID if experiment_id is None else experiment_id
    experiment = self.get_experiment(experiment_id)
    if experiment is None:
        raise MlflowException(
            "Could not create run under experiment with ID %s - no such experiment "
            "exists." % experiment_id,
            databricks_pb2.RESOURCE_DOES_NOT_EXIST)
    if experiment.lifecycle_stage != LifecycleStage.ACTIVE:
        raise MlflowException(
            "Could not create run under non-active experiment with ID "
            "%s." % experiment_id,
            databricks_pb2.INVALID_STATE)
    run_uuid = uuid.uuid4().hex
    artifact_uri = self._get_artifact_dir(experiment_id, run_uuid)
    # New runs always start in the RUNNING state with no end time.
    run_info = RunInfo(run_uuid=run_uuid, run_id=run_uuid, experiment_id=experiment_id,
                       artifact_uri=artifact_uri, user_id=user_id,
                       status=RunStatus.to_string(RunStatus.RUNNING),
                       start_time=start_time, end_time=None,
                       lifecycle_stage=LifecycleStage.ACTIVE)
    # Persist run metadata and create directories for logging metrics, parameters, artifacts
    run_dir = self._get_run_dir(run_info.experiment_id, run_info.run_id)
    mkdir(run_dir)
    run_info_dict = _make_persisted_run_info_dict(run_info)
    write_yaml(run_dir, FileStore.META_DATA_FILE_NAME, run_info_dict)
    mkdir(run_dir, FileStore.METRICS_FOLDER_NAME)
    mkdir(run_dir, FileStore.PARAMS_FOLDER_NAME)
    mkdir(run_dir, FileStore.ARTIFACTS_FOLDER_NAME)
    for tag in tags:
        self.set_tag(run_uuid, tag)
    # Re-read the run from disk so the returned entity reflects the persisted state.
    return self.get_run(run_id=run_uuid)
def test_run(use_start_run):
    """
    Run the test project synchronously and validate the resulting run's status,
    params, metrics, and tags as persisted in the FileStore.
    """
    submitted_run = kiwi.projects.run(
        TEST_PROJECT_DIR, entry_point="test_tracking",
        parameters={"use_start_run": use_start_run},
        use_conda=False, experiment_id=FileStore.DEFAULT_EXPERIMENT_ID)
    assert submitted_run.run_id is not None
    # Blocking runs should be finished when they return
    validate_exit_status(submitted_run.get_status(), RunStatus.FINISHED)
    # Test that we can call wait() on a synchronous run & that the run has the correct
    # status after calling wait().
    submitted_run.wait()
    validate_exit_status(submitted_run.get_status(), RunStatus.FINISHED)
    # Validate run contents in the FileStore
    run_id = submitted_run.run_id
    mlflow_service = kiwi.tracking.MlflowClient()
    run_infos = mlflow_service.list_run_infos(
        experiment_id=FileStore.DEFAULT_EXPERIMENT_ID,
        run_view_type=ViewType.ACTIVE_ONLY)
    # The store should contain exactly the run we just submitted.
    assert len(run_infos) == 1
    store_run_id = run_infos[0].run_id
    assert run_id == store_run_id
    run = mlflow_service.get_run(run_id)
    assert run.info.status == RunStatus.to_string(RunStatus.FINISHED)
    assert run.data.params == {"use_start_run": use_start_run}
    assert run.data.metrics == {"some_key": 3}
    tags = run.data.tags
    assert tags[MLFLOW_USER] == MOCK_USER
    assert "file:" in tags[MLFLOW_SOURCE_NAME]
    assert tags[MLFLOW_SOURCE_TYPE] == SourceType.to_string(SourceType.PROJECT)
    assert tags[MLFLOW_PROJECT_ENTRY_POINT] == "test_tracking"
def __exit__(self, exc_type, exc_val, exc_tb):
    """End the active run: FINISHED on clean exit, FAILED when an exception occurred.
    Returns True (suppressing nothing further) only when no exception was raised."""
    if exc_type is None:
        end_run(RunStatus.to_string(RunStatus.FINISHED))
        return True
    end_run(RunStatus.to_string(RunStatus.FAILED))
    return False
def validate_exit_status(status_str, expected):
    """Assert that the string-encoded run status matches the expected enum value."""
    actual = RunStatus.from_string(status_str)
    assert actual == expected
class SqlRun(Base):
    """
    DB model for :py:class:`mlflow.entities.Run`. These are recorded in ``runs`` table.
    """
    __tablename__ = 'runs'

    run_uuid = Column(String(32), nullable=False)
    """
    Run UUID: `String` (limit 32 characters). *Primary Key* for ``runs`` table.
    """
    name = Column(String(250))
    """
    Run name: `String` (limit 250 characters).
    """
    source_type = Column(String(20), default=SourceType.to_string(SourceType.LOCAL))
    """
    Source Type: `String` (limit 20 characters). Can be one of ``NOTEBOOK``, ``JOB``,
    ``PROJECT``, ``LOCAL`` (default), or ``UNKNOWN``.
    """
    source_name = Column(String(500))
    """
    Name of source recording the run: `String` (limit 500 characters).
    """
    entry_point_name = Column(String(50))
    """
    Entry-point name that launched the run: `String` (limit 50 characters).
    """
    user_id = Column(String(256), nullable=True, default=None)
    """
    User ID: `String` (limit 256 characters). Defaults to ``null``.
    """
    status = Column(String(20), default=RunStatus.to_string(RunStatus.SCHEDULED))
    """
    Run Status: `String` (limit 20 characters). Can be one of ``RUNNING``, ``SCHEDULED``
    (default), ``FINISHED``, ``FAILED``, or ``KILLED`` (the full set accepted by the
    ``status`` check constraint below).
    """
    # NOTE(review): this default is evaluated once at class-definition time (not per
    # insert), and is in seconds while MLflow timestamps elsewhere appear to be in
    # milliseconds — confirm intended.
    start_time = Column(BigInteger, default=int(time.time()))
    """
    Run start time: `BigInteger`. Defaults to current system time.
    """
    end_time = Column(BigInteger, nullable=True, default=None)
    """
    Run end time: `BigInteger`.
    """
    source_version = Column(String(50))
    """
    Source version: `String` (limit 50 characters).
    """
    lifecycle_stage = Column(String(20), default=LifecycleStage.ACTIVE)
    """
    Lifecycle Stage of run: `String` (limit 20 characters).
    Can be either ``active`` (default) or ``deleted``.
    """
    artifact_uri = Column(String(200), default=None)
    """
    Default artifact location for this run: `String` (limit 200 characters).
    """
    experiment_id = Column(Integer, ForeignKey('experiments.experiment_id'))
    """
    Experiment ID to which this run belongs to: *Foreign Key* into ``experiment`` table.
    """
    experiment = relationship('SqlExperiment', backref=backref('runs', cascade='all'))
    """
    SQLAlchemy relationship (many:one) with
    :py:class:`mlflow.store.dbmodels.models.SqlExperiment`.
    """

    # DB-level constraints: restrict source_type/status/lifecycle_stage to known
    # values and make run_uuid the primary key.
    __table_args__ = (CheckConstraint(source_type.in_(SourceTypes), name='source_type'),
                      CheckConstraint(status.in_(RunStatusTypes), name='status'),
                      CheckConstraint(lifecycle_stage.in_(
                          LifecycleStage.view_type_to_stages(ViewType.ALL)),
                          name='runs_lifecycle_stage'),
                      PrimaryKeyConstraint('run_uuid', name='run_pk'))

    @staticmethod
    def get_attribute_name(mlflow_attribute_name):
        """
        Resolves an MLflow attribute name to a `SqlRun` attribute name.
        """
        # Currently, MLflow Search attributes defined in `SearchUtils.VALID_SEARCH_ATTRIBUTE_KEYS`
        # share the same names as their corresponding `SqlRun` attributes. Therefore, this function
        # returns the same attribute name
        return mlflow_attribute_name

    def to_mlflow_entity(self):
        """
        Convert DB model to corresponding MLflow entity.

        :return: :py:class:`mlflow.entities.Run`.
        """
        run_info = RunInfo(
            run_uuid=self.run_uuid,
            run_id=self.run_uuid,
            experiment_id=str(self.experiment_id),
            user_id=self.user_id,
            status=self.status,
            start_time=self.start_time,
            end_time=self.end_time,
            lifecycle_stage=self.lifecycle_stage,
            artifact_uri=self.artifact_uri)
        run_data = RunData(
            metrics=[m.to_mlflow_entity() for m in self.latest_metrics],
            params=[p.to_mlflow_entity() for p in self.params],
            tags=[t.to_mlflow_entity() for t in self.tags])
        return Run(run_info=run_info, run_data=run_data)
def test_is_terminated(self):
    """is_terminated is True exactly for the terminal statuses."""
    for terminal in (RunStatus.FAILED, RunStatus.FINISHED, RunStatus.KILLED):
        self.assertTrue(RunStatus.is_terminated(terminal))
    for live in (RunStatus.SCHEDULED, RunStatus.RUNNING):
        self.assertFalse(RunStatus.is_terminated(live))
def test_status_mappings(self):
    # Round-trip every enum <-> string mapping, then check the error paths.
    mappings = [
        ("RUNNING", RunStatus.RUNNING),
        ("SCHEDULED", RunStatus.SCHEDULED),
        ("FINISHED", RunStatus.FINISHED),
        ("FAILED", RunStatus.FAILED),
        ("KILLED", RunStatus.KILLED),
    ]
    for name, enum_value in mappings:
        self.assertEqual(name, RunStatus.to_string(enum_value))
        self.assertEqual(enum_value, RunStatus.from_string(name))

    with self.assertRaises(Exception) as e:
        RunStatus.to_string(-120)
    self.assertIn("Could not get string corresponding to run status -120",
                  str(e.exception))

    with self.assertRaises(Exception) as e:
        RunStatus.from_string("the IMPOSSIBLE status string")
    self.assertIn("Could not get run status corresponding to string the IMPO",
                  str(e.exception))
def wait(self):
    """Block until the Kubernetes job reaches a terminal status.

    :return: True if the job FINISHED, False otherwise."""
    kube_api = kubernetes.client.BatchV1Api()
    current = self._update_status(kube_api)
    while not RunStatus.is_terminated(current):
        time.sleep(self.POLL_STATUS_INTERVAL)
        current = self._update_status(kube_api)
    return self._status == RunStatus.FINISHED
BigInteger, PrimaryKeyConstraint, Boolean) from kiwi.entities import (Experiment, RunTag, Metric, Param, RunData, RunInfo, SourceType, RunStatus, Run, ViewType, ExperimentTag) from kiwi.entities.lifecycle_stage import LifecycleStage from kiwi.store.db.base_sql_model import Base SourceTypes = [ SourceType.to_string(SourceType.NOTEBOOK), SourceType.to_string(SourceType.JOB), SourceType.to_string(SourceType.LOCAL), SourceType.to_string(SourceType.UNKNOWN), SourceType.to_string(SourceType.PROJECT) ] RunStatusTypes = [ RunStatus.to_string(RunStatus.SCHEDULED), RunStatus.to_string(RunStatus.FAILED), RunStatus.to_string(RunStatus.FINISHED), RunStatus.to_string(RunStatus.RUNNING), RunStatus.to_string(RunStatus.KILLED) ] class SqlExperiment(Base): """ DB model for :py:class:`mlflow.entities.Experiment`. These are recorded in ``experiment`` table. """ __tablename__ = 'experiments' experiment_id = Column(Integer, autoincrement=True) """
def get_status(self):
    """Return the cached status if terminal; otherwise refresh and return it."""
    cached = self._status
    if RunStatus.is_terminated(cached):
        return cached
    return self._update_status()
def get_status(self):
    """Return this run's current status as a string."""
    current = self._get_status()
    return RunStatus.to_string(current)
def get_status(self, databricks_run_id):
    """Return the status of the given Databricks run as a string."""
    current = self._get_status(databricks_run_id)
    return RunStatus.to_string(current)