def _create_run(uri, experiment_id, work_dir, entry_point):
    """
    Create a ``Run`` against the current MLflow tracking server, logging
    metadata (e.g. the URI, entry point, and parameters of the project) about
    the run. Return an ``ActiveRun`` that can be used to report additional
    data about the run (metrics/params) to the tracking server.
    """
    if _is_local_uri(uri):
        source_name = tracking.utils._get_git_url_if_present(_expand_uri(uri))
    else:
        source_name = _expand_uri(uri)
    source_version = _get_git_commit(work_dir)
    existing_run = fluent.active_run()
    if existing_run:
        parent_run_id = existing_run.info.run_id
    else:
        parent_run_id = None
    tags = {
        MLFLOW_USER: _get_user(),
        MLFLOW_SOURCE_NAME: source_name,
        MLFLOW_SOURCE_TYPE: SourceType.to_string(SourceType.PROJECT),
        MLFLOW_PROJECT_ENTRY_POINT: entry_point
    }
    if source_version is not None:
        tags[MLFLOW_GIT_COMMIT] = source_version
    if parent_run_id is not None:
        tags[MLFLOW_PARENT_RUN_ID] = parent_run_id
    active_run = tracking.MlflowClient().create_run(
        experiment_id=experiment_id, tags=tags)
    return active_run
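# Usage sketch (not part of the original code): the parent-run tagging above,
# reproduced with public MLflow APIs. "mlflow.parentRunId" is the literal
# value of MLFLOW_PARENT_RUN_ID; the tracking store is whatever is configured.
if __name__ == "__main__":
    import mlflow
    from mlflow.tracking import MlflowClient

    with mlflow.start_run() as parent:
        # Inside an active run, _create_run would set MLFLOW_PARENT_RUN_ID so
        # the project run is grouped under `parent` in the MLflow UI.
        child = MlflowClient().create_run(
            experiment_id=parent.info.experiment_id,
            tags={"mlflow.parentRunId": parent.info.run_id},
        )
        print("child run:", child.info.run_id)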
def _get_docker_tag_name(imagename, work_dir):
    """Returns an appropriate Docker tag for a project based on name and git hash."""
    imagename = imagename if imagename else "docker-project"
    # Optionally include the first 7 characters of the git SHA in the tag name,
    # if available.
    git_commit = _get_git_commit(work_dir)
    version_string = ":" + git_commit[:7] if git_commit else ""
    return imagename + version_string
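# A self-contained sketch of the two branches above (the commit hash is
# fabricated). Note the precedence: the conditional expression binds looser
# than +, so the whole ":" + git_commit[:7] is gated on git_commit.
def _demo_docker_tag(imagename, git_commit):
    imagename = imagename if imagename else "docker-project"
    version_string = ":" + git_commit[:7] if git_commit else ""
    return imagename + version_string

assert _demo_docker_tag("my-project", "abc1234def5678") == "my-project:abc1234"
assert _demo_docker_tag(None, None) == "docker-project"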
def _create_run(uri, experiment_id, work_dir, version, entry_point, parameters):
    """
    Create a ``Run`` against the current MLflow tracking server, logging
    metadata (e.g. the URI, entry point, and parameters of the project) about
    the run. Return an ``ActiveRun`` that can be used to report additional
    data about the run (metrics/params) to the tracking server.
    """
    if _is_local_uri(uri):
        source_name = tracking._tracking_service.utils._get_git_url_if_present(
            _expand_uri(uri))
    else:
        source_name = _expand_uri(uri)
    source_version = _get_git_commit(work_dir)
    existing_run = fluent.active_run()
    if existing_run:
        parent_run_id = existing_run.info.run_id
    else:
        parent_run_id = None
    tags = {
        MLFLOW_USER: _get_user(),
        MLFLOW_SOURCE_NAME: source_name,
        MLFLOW_SOURCE_TYPE: SourceType.to_string(SourceType.PROJECT),
        MLFLOW_PROJECT_ENTRY_POINT: entry_point,
    }
    if source_version is not None:
        tags[MLFLOW_GIT_COMMIT] = source_version
    if parent_run_id is not None:
        tags[MLFLOW_PARENT_RUN_ID] = parent_run_id
    repo_url = _get_git_repo_url(work_dir)
    if repo_url is not None:
        tags[MLFLOW_GIT_REPO_URL] = repo_url
        tags[LEGACY_MLFLOW_GIT_REPO_URL] = repo_url
    # Add branch name tag if a branch is specified through --version
    if _is_valid_branch_name(work_dir, version):
        tags[MLFLOW_GIT_BRANCH] = version
        tags[LEGACY_MLFLOW_GIT_BRANCH_NAME] = version
    active_run = tracking.MlflowClient().create_run(
        experiment_id=experiment_id, tags=tags)

    project = _project_spec.load_project(work_dir)
    # Consolidate parameters for logging.
    # `storage_dir` is `None` since we want to log the actual path, not the
    # downloaded local path.
    entry_point_obj = project.get_entry_point(entry_point)
    final_params, extra_params = entry_point_obj.compute_parameters(
        parameters, storage_dir=None)
    params_list = [
        Param(key, value)
        for key, value in list(final_params.items()) + list(extra_params.items())
    ]
    tracking.MlflowClient().log_batch(active_run.info.run_id, params=params_list)
    return active_run
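# Sketch of the parameter-consolidation step with made-up values (not part of
# the original code): assume the entry point declares only `alpha`, so
# compute_parameters would place the undeclared `batch_size` in extra_params.
from mlflow.entities import Param as _Param

_final_params = {"alpha": "0.5"}       # declared in the MLproject entry point
_extra_params = {"batch_size": "64"}   # passed on the CLI but undeclared

_params_list = [
    _Param(key, value)
    for key, value in list(_final_params.items()) + list(_extra_params.items())
]
assert [(p.key, p.value) for p in _params_list] == [
    ("alpha", "0.5"), ("batch_size", "64"),
]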
def _get_docker_image_uri(repository_uri, work_dir):
    """
    Returns an appropriate Docker image URI for a project based on the git
    hash of the specified working directory.

    :param repository_uri: The URI of the Docker repository with which to tag
                           the image. The repository URI is used as the prefix
                           of the image URI.
    :param work_dir: Path to the working directory in which to search for a
                     git commit hash.
    """
    repository_uri = repository_uri if repository_uri else "docker-project"
    # Optionally include the first 7 characters of the git SHA in the tag name,
    # if available.
    git_commit = _get_git_commit(work_dir)
    version_string = ":" + git_commit[:7] if git_commit else ""
    return repository_uri + version_string
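# Illustrative result when tagging against a remote registry; the repository
# URI and commit hash below are invented, not real endpoints.
_repository_uri = "123456789012.dkr.ecr.us-west-2.amazonaws.com/mlflow-project"
_git_commit = "deadbeefcafe0123"
_image_uri = _repository_uri + (":" + _git_commit[:7] if _git_commit else "")
assert _image_uri == (
    "123456789012.dkr.ecr.us-west-2.amazonaws.com/mlflow-project:deadbee")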
def main(cfg: DictConfig) -> None:
    # set up the mlflow experiment id
    mlflow.set_tracking_uri(f"file://{to_absolute_path(cfg.path_to_mlflow)}")
    experiment = mlflow.get_experiment_by_name(cfg.experiment_name)
    if experiment is not None:
        run_kwargs = {'experiment_id': experiment.experiment_id}
        if cfg["pretrained"] is not None:
            # initialise from the pretrained run; otherwise a new run is created
            run_kwargs['run_id'] = cfg["pretrained"]["run_id"]
    else:
        # create a new experiment
        experiment_id = mlflow.create_experiment(cfg.experiment_name)
        run_kwargs = {'experiment_id': experiment_id}

    # run the training with mlflow tracking
    with mlflow.start_run(**run_kwargs) as main_run:
        if cfg["pretrained"] is not None:
            mlflow.start_run(experiment_id=run_kwargs['experiment_id'], nested=True)
        active_run = mlflow.active_run()
        run_id = active_run.info.run_id

        setup_gpu(cfg.gpu_cfg)
        training_cfg = OmegaConf.to_object(cfg.training_cfg)  # convert to a python dictionary
        scaling_cfg = to_absolute_path(cfg.scaling_cfg)
        dataloader = DataLoader.DataLoader(training_cfg, scaling_cfg)
        setup = dataloader.config["SetupNN"]
        TauLosses.SetSFs(*setup["TauLossesSFs"])
        print("loss consts:", TauLosses.Le_sf, TauLosses.Lmu_sf,
              TauLosses.Ltau_sf, TauLosses.Ljet_sf)

        if setup["using_new_loss"]:
            tf.config.run_functions_eagerly(True)
        netConf_full = dataloader.get_net_config()

        if dataloader.input_type == "Adversarial":
            model = create_model(
                netConf_full, dataloader.model_name, loss=setup["loss"],
                use_newloss=setup["using_new_loss"], use_AdvDataset=True,
                adv_param=dataloader.adversarial_parameter,
                n_adv_tau=dataloader.adv_batch_size,
                adv_learning_rate=dataloader.adv_learning_rate)
        else:
            model = create_model(netConf_full, dataloader.model_name,
                                 loss=setup["loss"],
                                 use_newloss=setup["using_new_loss"])

        if cfg.pretrained is None:
            print("Warning: no pretrained NN -> training will be started from scratch")
            old_opt = None
        else:
            print("Warning: training will be started from a pretrained model.")
            print(f"Model: run_id={cfg.pretrained.run_id}, "
                  f"experiment_id={cfg.pretrained.experiment_id}, "
                  f"model={cfg.pretrained.starting_model}")
            path_to_pretrain = to_absolute_path(
                f'{cfg.path_to_mlflow}/{cfg.pretrained.experiment_id}/{cfg.pretrained.run_id}/artifacts/')
            old_model = load_model(
                path_to_pretrain + f"/model_checkpoints/{cfg.pretrained.starting_model}",
                compile=False, custom_objects=None)
            # copy weights layer by layer, matching layers by name
            for layer in model.layers:
                weights_found = False
                for old_layer in old_model.layers:
                    if layer.name == old_layer.name:
                        layer.set_weights(old_layer.get_weights())
                        weights_found = True
                        break
                if not weights_found:
                    print(f"Weights for layer '{layer.name}' not found.")
            old_opt = old_model.optimizer
            old_vars = [var.name for var in old_model.trainable_variables]

        compile_model(model, setup["optimizer_name"], setup["learning_rate"],
                      setup["metrics"], setup["schedule_decay"])
        fit_hist = run_training(model, dataloader, False, cfg.log_suffix,
                                setup["using_new_loss"], old_opt=old_opt)

        # log NN params
        for net_type in ['tau_net', 'comp_net', 'comp_merge_net',
                         'conv_2d_net', 'dense_net']:
            mlflow.log_params({
                f'{net_type}_{k}': v
                for k, v in cfg.training_cfg.SetupNN[net_type].items()
            })
        mlflow.log_params({
            f'TauLossesSFs_{i}': v
            for i, v in enumerate(cfg.training_cfg.SetupNN.TauLossesSFs)
        })
        with open(to_absolute_path(
                f'{cfg.path_to_mlflow}/{run_kwargs["experiment_id"]}/{run_id}/artifacts/model_summary.txt')) as f:
            for line in f:
                if (s := 'Trainable params: ') in line:
                    mlflow.log_param('n_train_params',
                                     int(line.split(s)[-1].replace(',', '')))

        # log training related files
        mlflow.log_dict(training_cfg, 'input_cfg/training_cfg.yaml')
        mlflow.log_artifact(scaling_cfg, 'input_cfg')
        mlflow.log_artifact(to_absolute_path("Training_CNN.py"), 'input_cfg')
        mlflow.log_artifact(to_absolute_path("common.py"), 'input_cfg')

        # log hydra files
        mlflow.log_artifacts('.hydra', 'input_cfg/hydra')
        mlflow.log_artifact('Training_CNN.log', 'input_cfg/hydra')

        # log misc. info
        mlflow.log_param('run_id', run_id)
        mlflow.log_param('git_commit', _get_git_commit(to_absolute_path('.')))
        print(f'\nTraining has finished! Corresponding MLflow experiment name (ID): '
              f'{cfg.experiment_name}({run_kwargs["experiment_id"]}), and run ID: {run_id}\n')
        mlflow.end_run()

    # Temporary workaround to kill additional subprocesses that have not exited correctly
    try:
        current_process = psutil.Process()
        children = current_process.children(recursive=True)
        for child in children:
            child.kill()
    except Exception:
        pass
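# Sketch of the resume pattern used in main() (not part of the original code):
# reopen an existing run as the parent, then nest a fresh run under it so the
# new training is grouped with the original. The "pretrained" parent run is
# created on the fly here purely for illustration.
if __name__ == "__main__":
    import mlflow

    pretrained_run_id = mlflow.start_run(experiment_id="0").info.run_id
    mlflow.end_run()

    with mlflow.start_run(run_id=pretrained_run_id):      # reopen "pretrained" run
        mlflow.start_run(experiment_id="0", nested=True)  # fresh nested run
        run_id = mlflow.active_run().info.run_id          # new metrics land here
        mlflow.log_param("resumed_from", pretrained_run_id)
        mlflow.end_run()                                  # close the nested run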