def _check_directory(directory: str, if_exists: str) -> str: if os.path.exists(directory): if if_exists == 'error': raise ValueError('directory {} already exists.'.format(directory)) elif if_exists == 'replace': warnings.warn( 'directory {} already exists. It will be replaced by the new result' .format(directory)) existing_run_id = _try_to_get_existing_mlflow_run_id(directory) if existing_run_id is not None: requires_mlflow() import mlflow mlflow.delete_run(existing_run_id) shutil.rmtree(directory, ignore_errors=True) elif if_exists == 'rename': postfix_index = 1 while os.path.exists(directory + '_' + str(postfix_index)): postfix_index += 1 directory += '_' + str(postfix_index) warnings.warn( 'directory is renamed to {} because the original directory already exists.' .format(directory)) return directory
def finalize():
    """Finish a (possibly distributed) training run.

    Synchronizes ranks, then on the main process renames the last saved
    checkpoint to ``WEIGHTS_NAME`` for easy re-loading with
    ``from_pretrained``, uploads it as an mlflow artifact and closes the run.
    If no checkpoint exists, the mlflow run is deleted instead.

    NOTE(review): relies on closure/global names ``args``,
    ``checkpoint_handler``, ``logger``, ``WEIGHTS_NAME``, ``rmtree``,
    ``mlflow`` and ``torch`` — confirm they are in scope at the call site.
    """
    if args.local_rank not in [-1, 0]:
        # Non-main ranks wait here so only rank -1/0 performs the
        # checkpoint rename and mlflow bookkeeping below.
        torch.distributed.barrier()
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        try:
            # On the main process: rename the last checkpoint
            # (for easy re-loading with from_pretrained method)
            os.rename(checkpoint_handler._saved[-1][1][-1],
                      os.path.join(args.logdir, WEIGHTS_NAME))
            if args.use_mlflow:
                mlflow.log_artifact(args.logdir / WEIGHTS_NAME, "training")
                logger.info("ending mlflow run")
                logger.info(f"run_id: {mlflow.active_run().info.run_id}")
                mlflow.end_run()
                rmtree(args.logdir)
        # BUG FIX: was a bare ``except:``, which also swallows
        # SystemExit/KeyboardInterrupt; narrow it to Exception.
        except Exception:
            logger.info("No checkpoint to finalize the model. Deleting run")
            # TODO: fix issue in mlflow trying to delete the experiment multiple times
            mlflow.delete_run(mlflow.active_run().info.run_id)
            rmtree(args.logdir)
    if args.local_rank == 0:
        # Release the other ranks blocked on the first barrier.
        torch.distributed.barrier()
def ignore_and_delete_unfinished(df: DataFrame) -> DataFrame:
    """
    Delete 'unfinished' experiment runs and return only the finished ones.

    A run is considered unfinished when ``metrics.percentage_infected`` is
    non-zero; each such run is removed via ``mlflow.delete_run``.

    :param df: (DataFrame) all existing experiments
    :return: (DataFrame) only finished experiments
    """
    unfinished_mask = df['metrics.percentage_infected'] != 0.0
    unfinished = df[unfinished_mask]
    if len(unfinished) > 0:
        print('There are {} experiments to be deleted'.format(len(unfinished)))
        for rid in unfinished['run_id']:
            mlflow.delete_run(run_id=rid)
    # Keep only the rows that were not flagged for deletion.
    return df[~unfinished_mask]
def delete_unfinished_experiments():
    """
    Find all unfinished experiments (with n_infected != 0) and delete them.

    :return: None
    """
    print('Loading experiments ...')
    mlflow.set_experiment(GlobalConfig().experiment_name)
    runs = mlflow.search_runs(experiment_ids=GlobalConfig().experiment_id)
    print('... found {} experiments.'.format(len(runs)))

    # A non-zero infection percentage marks a run that never completed.
    unfinished = runs[runs['metrics.percentage_infected'] != 0.0]

    print('There are {} experiments to be deleted'.format(len(unfinished)))
    for rid in tqdm(unfinished['run_id']):
        mlflow.delete_run(run_id=rid)
def remove_old_models(
    self, experiment_name: str, max_n_models: int = 10, artifact_folder: str = None
):
    """Remove old models per experiment, keeping only the newest ``max_n_models``.

    Note: This functionality is not incorporated in MLFlow natively
    See also: https://github.com/mlflow/mlflow/issues/2152

    :param experiment_name: experiment whose runs are pruned.
    :param max_n_models: number of most recent runs to keep (must be >= 1).
    :param artifact_folder: optional mlruns root; when given, the run's
        artifact directory is also removed from disk (mlflow.delete_run
        only marks the run as deleted).
    :raises ValueError: if ``max_n_models`` is less than 1.
    """
    if max_n_models < 1:
        # FIX: message used to say "greater than 1", contradicting the check.
        raise ValueError(
            f"Max models to keep should be at least 1! Received: {max_n_models}"
        )

    previous_runs = self._find_models(experiment_name=experiment_name)
    if len(previous_runs) > max_n_models:
        self.logger.debug(
            f"Going to delete old models. {len(previous_runs)} > {max_n_models}"
        )
        # BUG FIX: the original used .loc[max_n_models:, :]. .loc slices by
        # *index label*, and after sort_values the index labels are shuffled,
        # so that selected an essentially arbitrary set of rows. .iloc slices
        # by position: everything after the ``max_n_models`` newest runs.
        runs_to_remove = previous_runs.sort_values(
            by="end_time", ascending=False
        ).iloc[max_n_models:, :]
        for _, run in runs_to_remove.iterrows():
            self.logger.debug(
                f"Going to remove run {run.run_id}, from {run.end_time}."
            )
            mlflow.delete_run(run.run_id)
            self.logger.debug("Removed run")

            # mlflow.delete_run only marks it as deleted but does not delete it by itself
            if artifact_folder:
                # Also try to remove artifact from disk.
                artifact_filepath = (
                    f"{artifact_folder}/mlruns/{run.experiment_id}/{run.run_id}"
                )
                self.logger.debug(f"Removing artifact: {artifact_filepath}")
                try:
                    shutil.rmtree(artifact_filepath)
                    self.logger.debug("Removed artifact")
                except Exception as e:
                    self.logger.info(f"Failed removing artifacts: {e}")
# Demo: create two runs, soft-delete the second, then list runs by
# lifecycle stage (active / deleted) to show what delete_run does.
if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    print(mlflow.__version__)

    # Create two runs
    with mlflow.start_run() as run1:
        mlflow.log_param("p", 0)
        mlflow.log_metric("click_rate", 1.55)
    with mlflow.start_run() as run2:
        mlflow.log_param("p", 0)
        mlflow.log_metric("click_rate", 2.50)

    # Delete the last run
    # (delete_run only marks the run deleted; artifacts stay on disk.)
    mlflow.delete_run(run2.info.run_id)

    # Helper: print run id and lifecycle stage for each RunInfo.
    def print_run_infos(run_infos):
        for r in run_infos:
            print("- run_id: {}, lifecycle_stage: {}".format(
                r.run_id, r.lifecycle_stage))

    # "0" is the default experiment id here.
    # NOTE(review): mlflow.list_run_infos was deprecated/removed in mlflow 2.x
    # in favour of mlflow.search_runs — confirm the pinned mlflow version.
    print("Active runs:")
    print_run_infos(
        mlflow.list_run_infos("0", run_view_type=ViewType.ACTIVE_ONLY))

    print("Deleted runs:")
    print_run_infos(
        mlflow.list_run_infos("0", run_view_type=ViewType.DELETED_ONLY))

    # NOTE(review): snippet appears truncated — "All runs:" is printed but the
    # corresponding list_run_infos(..., ViewType.ALL) call is missing here.
    print("All runs:")
#
# Code snippet for https://mlflow.org/docs/latest/python_api/mlflow.html#delete_run
#
import warnings

import mlflow

if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    print(mlflow.__version__)

    # Create a run, soft-delete it, then show the resulting lifecycle stage.
    with mlflow.start_run() as active_run:
        mlflow.log_param("p", 0)
        run_id = active_run.info.run_id

    mlflow.delete_run(run_id)

    lifecycle_stage = mlflow.get_run(run_id).info.lifecycle_stage
    print("run_id: {}; lifecycle_stage: {}".format(run_id, lifecycle_stage))
def delete_run(self, run_id: str):
    """Soft-delete the mlflow run with the given id.

    Thin delegation to ``mlflow.delete_run``; the run is marked deleted on
    the tracking server rather than physically removed.

    :param run_id: id of the run to delete.
    :return: the value returned by ``mlflow.delete_run``
        (``None`` in current mlflow versions — NOTE(review): confirm).
    """
    return mlflow.delete_run(run_id)