def _create_child_runs_for_parameter_search(parent_estimator, parent_model, parent_run, child_tags):
    """
    Creates one finished child run for every parameter map explored by a parameter
    search estimator.

    For each entry of ``parent_estimator.getEstimatorParamMaps()``, the tuned estimator is
    copied with that parameter map applied, a child run tagged with the parent run ID is
    created, the copy's parameters and the metrics recorded for that parameter map are logged,
    and the run is terminated.

    :param parent_estimator: The parameter search estimator; must expose
                             `getEstimatorParamMaps()` and `getEstimator()`.
    :param parent_model: The fitted parameter search model, forwarded to
                         `_get_param_search_metrics_and_best_index`.
    :param parent_run: The parent run; its experiment ID, run ID and start time are reused
                       for every child run.
    :param child_tags: An optional dictionary of tags to set on each child run.
    """
    from itertools import zip_longest

    client = MlflowClient()
    # The moment at which each point of the search space was evaluated is unknown, so
    # every child run borrows the parent's start time and shares a single end time
    start_time_ms = parent_run.info.start_time
    end_time_ms = int(time.time() * 1000)

    param_maps = parent_estimator.getEstimatorParamMaps()
    base_estimator = parent_estimator.getEstimator()
    metrics_by_name, _ = _get_param_search_metrics_and_best_index(parent_estimator, parent_model)

    for index, param_map in enumerate(param_maps):
        child_estimator = base_estimator.copy(param_map)

        run_tags = {}
        if child_tags:
            run_tags.update(child_tags)
        run_tags[MLFLOW_PARENT_RUN_ID] = parent_run.info.run_id
        run_tags.update(_get_estimator_info_tags(child_estimator))

        child_run = client.create_run(
            experiment_id=parent_run.info.experiment_id,
            start_time=start_time_ms,
            tags=run_tags,
        )
        child_run_id = child_run.info.run_id

        child_params = _get_instance_param_map(
            child_estimator, parent_estimator._autologging_metadata.uid_to_indexed_name_map
        )
        param_batches = _chunk_dict(child_params, chunk_size=MAX_PARAMS_TAGS_PER_BATCH)
        child_metrics = {name: values[index] for name, values in metrics_by_name.items()}

        # All metrics travel with the first parameter batch; whichever side runs out first
        # is padded with empty dictionaries
        for raw_params, raw_metrics in zip_longest(param_batches, [child_metrics], fillvalue={}):
            # Keys and values longer than the limits enforced by the MLflow Tracking APIs
            # (e.g., LogParam, LogMetric) are trimmed before logging
            safe_params = _truncate_dict(raw_params, MAX_ENTITY_KEY_LENGTH, MAX_PARAM_VAL_LENGTH)
            safe_metrics = _truncate_dict(raw_metrics, max_key_length=MAX_ENTITY_KEY_LENGTH)

            param_entities = [Param(str(key), str(value)) for key, value in safe_params.items()]
            metric_entities = [
                Metric(key=str(key), value=value, timestamp=end_time_ms, step=0)
                for key, value in safe_metrics.items()
            ]
            client.log_batch(run_id=child_run_id, params=param_entities, metrics=metric_entities)

        client.set_terminated(run_id=child_run_id, end_time=end_time_ms)
class MlflowAutologgingQueueingClient:
    """
    A queueing front end for a subset of the `MlflowClient` / fluent tracking APIs, aimed at
    autologging use cases.

    Calls such as `create_run`, `log_params` and `log_metrics` do not talk to MLflow Tracking
    directly; they only enqueue data (truncating over-long parameter / tag / metric keys and
    values along the way). Nothing is persisted until `flush()` is invoked, which batches the
    queued operations per run and executes them either synchronously or asynchronously.

    This client is not threadsafe; none of its APIs should be called concurrently.
    """

    def __init__(self, tracking_uri=None):
        self._client = MlflowClient(tracking_uri)
        self._pending_ops_by_run_id = {}

    def __enter__(self):
        """
        Supports use as a context manager: queued run content is flushed synchronously when
        the context exits without an exception, so `flush()` need not be called explicitly.
        """
        return self

    def __exit__(self, exc_type, exc, traceback):  # pylint: disable=unused-argument
        """
        Flushes queued run content synchronously on context exit, but only when the context
        body completed without raising.
        """
        # NB: `flush()` may itself raise, which would hide the exception thrown by the context
        # body. Skipping the flush in that case mirrors what happens when an exception
        # preempts an explicit `flush()` call: the content is simply not logged
        exited_cleanly = exc is None and exc_type is None and traceback is None
        if not exited_cleanly:
            _logger.debug(
                "Skipping run content logging upon MlflowAutologgingQueueingClient context because"
                " an exception was raised within the context: %s",
                exc,
            )
            return
        self.flush(synchronous=True)

    def create_run(
        self,
        experiment_id: str,
        start_time: Optional[int] = None,
        tags: Optional[Dict[str, Any]] = None,
    ) -> PendingRunId:
        """
        Queues a CreateRun operation with the given attributes.

        :return: A `PendingRunId` placeholder that can be supplied as the `run_id` argument
                 of the other logging APIs of this client (e.g. `log_params`, `log_metrics`).
        """
        safe_tags = _truncate_dict(
            tags or {},
            max_key_length=MAX_ENTITY_KEY_LENGTH,
            max_value_length=MAX_TAG_VAL_LENGTH,
        )
        pending_run_id = PendingRunId()
        run_queue = self._get_pending_operations(pending_run_id)
        run_queue.enqueue(
            create_run=_PendingCreateRun(
                experiment_id=experiment_id,
                start_time=start_time,
                tags=[RunTag(key, str(value)) for key, value in safe_tags.items()],
            )
        )
        return pending_run_id

    def set_terminated(
        self,
        run_id: Union[str, PendingRunId],
        status: Optional[str] = None,
        end_time: Optional[int] = None,
    ) -> None:
        """
        Queues an UpdateRun operation carrying the given `status` and `end_time` for the run
        identified by `run_id`.
        """
        run_queue = self._get_pending_operations(run_id)
        run_queue.enqueue(set_terminated=_PendingSetTerminated(status=status, end_time=end_time))

    def log_params(self, run_id: Union[str, PendingRunId], params: Dict[str, Any]) -> None:
        """
        Queues the given parameters for logging to the run identified by `run_id`.
        """
        safe_params = _truncate_dict(
            params,
            max_key_length=MAX_ENTITY_KEY_LENGTH,
            max_value_length=MAX_PARAM_VAL_LENGTH,
        )
        param_entities = [Param(key, str(value)) for key, value in safe_params.items()]
        self._get_pending_operations(run_id).enqueue(params=param_entities)

    def log_metrics(
        self,
        run_id: Union[str, PendingRunId],
        metrics: Dict[str, float],
        step: Optional[int] = None,
    ) -> None:
        """
        Queues the given metrics for logging to the run identified by `run_id`, all stamped
        with the current time and recorded at `step` (0 when `step` is not provided).
        """
        safe_metrics = _truncate_dict(metrics, max_key_length=MAX_ENTITY_KEY_LENGTH)
        logged_at_ms = int(time.time() * 1000)
        metric_step = step or 0
        metric_entities = [
            Metric(key, value, logged_at_ms, metric_step) for key, value in safe_metrics.items()
        ]
        self._get_pending_operations(run_id).enqueue(metrics=metric_entities)

    def set_tags(self, run_id: Union[str, PendingRunId], tags: Dict[str, Any]) -> None:
        """
        Queues the given tags for logging to the run identified by `run_id`.
        """
        safe_tags = _truncate_dict(
            tags,
            max_key_length=MAX_ENTITY_KEY_LENGTH,
            max_value_length=MAX_TAG_VAL_LENGTH,
        )
        tag_entities = [RunTag(key, str(value)) for key, value in safe_tags.items()]
        self._get_pending_operations(run_id).enqueue(tags=tag_entities)

    def flush(self, synchronous=True):
        """
        Submits every queued run operation for execution and empties the queue.

        :param synchronous: If `True`, block until all submitted operations have completed
                            before returning. If `False`, return immediately while the
                            operations may still be in flight.
        :return: A `RunOperations` instance wrapping the submitted operations. Completion can
                 be awaited through `RunOperations.await_completion()`; when `synchronous` is
                 `True` this has already been done.
        """
        futures = [
            _AUTOLOGGING_QUEUEING_CLIENT_THREAD_POOL.submit(
                self._flush_pending_operations,
                pending_operations=run_queue,
            )
            for run_queue in self._pending_ops_by_run_id.values()
        ]
        self._pending_ops_by_run_id = {}

        run_operations = RunOperations(futures)
        if synchronous:
            run_operations.await_completion()
        return run_operations

    def _get_pending_operations(self, run_id):
        """
        :return: The `_PendingRunOperations` queue for `run_id`, created on first access.
        """
        queues = self._pending_ops_by_run_id
        if run_id in queues:
            return queues[run_id]
        run_queue = _PendingRunOperations(run_id=run_id)
        queues[run_id] = run_queue
        return run_queue

    def _try_operation(self, fn, *args, **kwargs):
        """
        Calls `fn(*args, **kwargs)` and returns its result; if the call raises an `Exception`,
        the exception instance is returned instead of being propagated.
        """
        try:
            outcome = fn(*args, **kwargs)
        except Exception as e:
            outcome = e
        return outcome

    def _flush_pending_operations(self, pending_operations):
        """
        Executes, one after another, every operation queued for a single run.

        NB: Operations are not parallelized on a per-run basis because MLflow's File Store,
        which is frequently used for local ML development, does not support threadsafe
        metadata logging within a given run.
        """
        pending_create = pending_operations.create_run
        if pending_create:
            # Piggyback as many queued tags as still fit onto the CreateRun request
            creation_tags = pending_create.tags
            spare_tag_slots = MAX_ENTITIES_PER_BATCH - len(creation_tags)
            if spare_tag_slots > 0:
                creation_tags.extend(pending_operations.tags_queue[:spare_tag_slots])
                pending_operations.tags_queue = pending_operations.tags_queue[spare_tag_slots:]

            created_run = self._client.create_run(
                experiment_id=pending_create.experiment_id,
                start_time=pending_create.start_time,
                tags={tag.key: tag.value for tag in creation_tags},
            )
            pending_operations.run_id = created_run.info.run_id

        run_id = pending_operations.run_id
        assert not isinstance(run_id, PendingRunId), "Run ID cannot be pending for logging"

        results = []

        param_batches = chunk_list(
            pending_operations.params_queue,
            chunk_size=MAX_PARAMS_TAGS_PER_BATCH,
        )
        tag_batches = chunk_list(
            pending_operations.tags_queue,
            chunk_size=MAX_PARAMS_TAGS_PER_BATCH,
        )
        for params_batch, tags_batch in zip_longest(param_batches, tag_batches, fillvalue=[]):
            # Top up each params / tags batch with as many metrics as the per-batch entity
            # and metric limits still allow
            remaining_capacity = MAX_ENTITIES_PER_BATCH - len(params_batch) - len(tags_batch)
            metrics_to_take = max(0, min(remaining_capacity, MAX_METRICS_PER_BATCH))
            metrics_batch = pending_operations.metrics_queue[:metrics_to_take]
            pending_operations.metrics_queue = pending_operations.metrics_queue[metrics_to_take:]

            results.append(
                self._try_operation(
                    self._client.log_batch,
                    run_id=run_id,
                    metrics=metrics_batch,
                    params=params_batch,
                    tags=tags_batch,
                )
            )

        # Whatever metrics are left over are logged in metric-only batches
        for metrics_batch in chunk_list(
            pending_operations.metrics_queue, chunk_size=MAX_METRICS_PER_BATCH
        ):
            results.append(
                self._try_operation(self._client.log_batch, run_id=run_id, metrics=metrics_batch)
            )

        pending_termination = pending_operations.set_terminated
        if pending_termination:
            results.append(
                self._try_operation(
                    self._client.set_terminated,
                    run_id=run_id,
                    status=pending_termination.status,
                    end_time=pending_termination.end_time,
                )
            )

        failures = [result for result in results if isinstance(result, Exception)]
        if failures:
            raise MlflowException(
                message=(
                    "Failed to perform one or more operations on the run with ID {run_id}."
                    " Failed operations: {failures}".format(run_id=run_id, failures=failures)
                )
            )
def _create_child_runs_for_parameter_search(cv_estimator, parent_run, child_tags=None):
    """
    Creates a collection of child runs for a parameter search training session.
    Runs are reconstructed from the `cv_results_` attribute of the specified trained
    parameter search estimator - `cv_estimator`, which provides relevant performance
    metrics for each point in the parameter search space. One child run is created
    for each point in the parameter search space. For additional information, see
    `https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html`_.

    :param cv_estimator: The trained parameter search estimator for which to create
                         child runs.
    :param parent_run: A py:class:`mlflow.entities.Run` object referring to the parent
                       parameter search run for which child runs should be created.
    :param child_tags: An optional dictionary of MLflow tag keys and values to log
                       for each child run.
    """
    from itertools import zip_longest

    import pandas as pd

    client = MlflowClient()
    # Use the start time of the parent parameter search run as a rough estimate for the
    # start time of child runs, since we cannot precisely determine when each point
    # in the parameter search space was explored
    child_run_start_time = parent_run.info.start_time
    child_run_end_time = int(time.time() * 1000)

    seed_estimator = cv_estimator.estimator
    # In the unlikely case that a seed of a parameter search estimator is,
    # itself, a parameter search estimator, we should avoid logging the untuned
    # parameters of the seed's seed estimator
    should_log_params_deeply = not _is_parameter_search_estimator(seed_estimator)
    # Each row of `cv_results_` only provides parameters that vary across
    # the user-specified parameter grid. In order to log the complete set
    # of parameters for each child run, we fetch the parameters defined by
    # the seed estimator and update them with parameter subset specified
    # in the result row
    base_params = seed_estimator.get_params(deep=should_log_params_deeply)

    # Tags are identical for every child run, so they are assembled once; each
    # `create_run` call below receives its own copy
    child_run_tags = dict(child_tags) if child_tags else {}
    child_run_tags[MLFLOW_PARENT_RUN_ID] = parent_run.info.run_id
    child_run_tags.update(_get_estimator_info_tags(seed_estimator))

    # Parameters values are recorded twice in the set of search `cv_results_`:
    # once within a `params` column with dictionary values and once within
    # a separate dataframe column that is created for each parameter. To prevent
    # duplication of parameters, we log the consolidated values from the parameter
    # dictionary column and filter out the other parameter-specific columns with
    # names of the form `param_{param_name}`. Additionally, `cv_results_` produces
    # metrics for each training split, which is fairly verbose; accordingly, we filter
    # out per-split metrics in favor of aggregate metrics (mean, std, etc.)
    excluded_metric_prefixes = ("param", "split")
    # Leave room in every batch for a full chunk of parameters alongside the metrics
    metrics_chunk_size = min(
        MAX_ENTITIES_PER_BATCH - MAX_PARAMS_TAGS_PER_BATCH, MAX_METRICS_PER_BATCH
    )

    cv_results_df = pd.DataFrame.from_dict(cv_estimator.cv_results_)
    for _, result_row in cv_results_df.iterrows():
        child_run = client.create_run(
            experiment_id=parent_run.info.experiment_id,
            start_time=child_run_start_time,
            tags=dict(child_run_tags),
        )

        params_to_log = dict(base_params)
        params_to_log.update(result_row.get("params", {}))
        param_batches_to_log = _chunk_dict(params_to_log, chunk_size=MAX_PARAMS_TAGS_PER_BATCH)

        # NB: `Series.iteritems()` was removed in pandas 2.0; `Series.items()` is the
        # equivalent API and is available in all supported pandas versions
        metrics_to_log = {
            key: value
            for key, value in result_row.items()
            if not key.startswith(excluded_metric_prefixes) and isinstance(value, Number)
        }
        metric_batches_to_log = _chunk_dict(metrics_to_log, chunk_size=metrics_chunk_size)

        for params_batch, metrics_batch in zip_longest(
            param_batches_to_log, metric_batches_to_log, fillvalue={}
        ):
            # Trim any parameter keys / values and metric keys that exceed the limits
            # imposed by corresponding MLflow Tracking APIs (e.g., LogParam, LogMetric)
            truncated_params_batch = _truncate_dict(
                params_batch, MAX_ENTITY_KEY_LENGTH, MAX_PARAM_VAL_LENGTH
            )
            truncated_metrics_batch = _truncate_dict(
                metrics_batch, max_key_length=MAX_ENTITY_KEY_LENGTH
            )
            client.log_batch(
                run_id=child_run.info.run_id,
                params=[
                    Param(str(key), str(value)) for key, value in truncated_params_batch.items()
                ],
                metrics=[
                    Metric(key=str(key), value=value, timestamp=child_run_end_time, step=0)
                    for key, value in truncated_metrics_batch.items()
                ],
            )

        client.set_terminated(run_id=child_run.info.run_id, end_time=child_run_end_time)
class MLflowWriter(object):
    """
    Small helper that creates one MLflow run and logs parameters, metrics, models and
    artifacts to it.

    Metric values are buffered in memory per metric name (prefixed with ``train/`` or
    ``valid/``) and pushed to MLflow as averages, either explicitly through `toMlflow` or
    every `log_every` calls to `next_iteration`.
    """

    def __init__(self, exp_name, save_dir, log_every, **mlflow_cfg):
        """
        :param exp_name: Name of the MLflow experiment to activate and create the run in.
        :param save_dir: Value passed to `mlflow.set_tracking_uri`.
        :param log_every: Number of `next_iteration` calls between automatic metric uploads.
        :param mlflow_cfg: Extra keyword arguments forwarded to `MlflowClient`.
        """
        mlflow.set_tracking_uri(save_dir)
        self.client = MlflowClient(**mlflow_cfg)
        mlflow.set_experiment(exp_name)
        self.experiment_id = self.client.get_experiment_by_name(exp_name).experiment_id
        self.run_id = self.client.create_run(self.experiment_id).info.run_id
        self.log_every = log_every
        self.clear()

    def log_params_from_omegaconf(self, params):
        """Log every leaf of an OmegaConf config as a parameter named by its dotted path."""
        self._explore_recursive("", params)

    def _explore_recursive(self, parent_name, element):
        """
        Walk a `DictConfig` / `ListConfig` tree and log each leaf value as a parameter.

        :param parent_name: Dotted prefix (ending with ``.`` unless empty) for parameter names.
        :param element: The `DictConfig` or `ListConfig` node to traverse.
        :raises TypeError: If `element` is neither a `DictConfig` nor a `ListConfig`.
        """
        if isinstance(element, DictConfig):
            iterator = element.items()
        elif isinstance(element, ListConfig):
            iterator = enumerate(element)
        else:
            # Previously this case fell through and failed with an opaque
            # UnboundLocalError on `iterator`
            raise TypeError(
                "Expected a DictConfig or ListConfig, got {}".format(type(element).__name__)
            )

        for k, v in iterator:
            if isinstance(v, (DictConfig, ListConfig)):
                self._explore_recursive(f"{parent_name}{k}.", v)
            else:
                self.client.log_param(self.run_id, f"{parent_name}{k}", v)

    def log_torch_model(self, model, epoch):
        """Log `model` to this run as a PyTorch model artifact named ``model_<epoch>``."""
        with mlflow.start_run(self.run_id):
            mlflow.pytorch.log_model(model, "model_%04d" % epoch)

    def log_metric(self, key, value, is_training):
        """
        Buffer one metric value under ``train/<key>`` or ``valid/<key>``.

        :param key: Metric name without the train / valid prefix.
        :param value: The value to record; a `torch.Tensor` is converted to a Python float.
        :param is_training: Selects the ``train/`` prefix when true, ``valid/`` otherwise.
        """
        if isinstance(value, torch.Tensor):
            value = float(value.detach().cpu().numpy())
        metric_name = "train/" if is_training else "valid/"
        metric_name += str(key)
        # The buffer is keyed by the prefixed name, so membership must be resolved on
        # `metric_name` (not on the raw `key`) for values to accumulate
        self.metrics.setdefault(metric_name, []).append(value)

    def next_iteration(self):
        """Advance the iteration counter and upload metrics every `log_every` iterations."""
        self.iterations += 1
        if self.iterations % self.log_every == 0:
            self.toMlflow(nb_data=self.log_every)

    def toMlflow(self, nb_data=0, step=0):
        """
        Log, for every buffered metric, the mean of its last `nb_data` values.

        :param nb_data: Number of most recent values to average; 0 averages all buffered
                        values (``values[-0:]`` is the whole list).
        :param step: The MLflow step to log the averaged values at.
        """
        for key, value in self.metrics.items():
            self.client.log_metric(self.run_id, key, np.mean(value[-nb_data:]), step=step)

    def get_mean(self, key, is_training):
        """Return the mean of all buffered values of ``train/<key>`` or ``valid/<key>``."""
        metric_name = "train/" if is_training else "valid/"
        metric_name += str(key)
        return np.mean(self.metrics[metric_name])

    def clear(self):
        """Drop all buffered metric values and reset the iteration counter."""
        self.metrics = {}
        self.iterations = 0

    def log_artifact(self, path):
        """Upload the local file or directory at `path` as an artifact of this run."""
        self.client.log_artifact(self.run_id, local_path=path)

    def terminate(self):
        """Mark this run as terminated."""
        self.client.set_terminated(self.run_id)