def submit_to_azureml(self) -> Run:
    """
    Submit a job to AzureML, returning the resulting Run object, or exiting if we were asked to wait for
    completion and the Run did not succeed.

    :return: The AzureML Run object of the submitted job.
    :raises ValueError: If the model configuration does not have 'azure_dataset_id' set.
    :raises SystemExit: With exit code -1, if 'wait_for_completion' is set in the Azure configuration and
        the run status is anything other than COMPLETED.
    """
    # The adal package creates a logging.info line each time it gets an authentication token, avoid that.
    logging.getLogger('adal-python').setLevel(logging.WARNING)
    if not self.model_config.azure_dataset_id:
        raise ValueError(
            "When running on AzureML, the 'azure_dataset_id' property must be set."
        )
    model_config_overrides = str(self.model_config.overrides)
    source_config = SourceConfig(
        root_folder=self.project_root,
        # The script that was used to start this process is also the entry point of the AzureML job.
        entry_script=Path(sys.argv[0]).resolve(),
        conda_dependencies_files=[
            get_environment_yaml_file(),
            self.project_root / fixed_paths.ENVIRONMENT_YAML_FILE_NAME
        ],
        hyperdrive_config_func=lambda estimator: self.model_config.get_hyperdrive_config(estimator),
        # For large jobs, upload of results times out frequently because of large checkpoint files.
        # Default is 600, we use 24 hours instead.
        upload_timeout_seconds=86400,
    )
    source_config.set_script_params_except_submit_flag()
    assert self.model_config.azure_dataset_id is not None  # to stop mypy complaining about next line
    # NOTE(review): this call takes 4 arguments, so it presumably resolves to a submission helper of the
    # same name that is imported at file level, not to this method - confirm against the imports.
    azure_run = submit_to_azureml(self.azure_config, source_config, model_config_overrides,
                                  self.model_config.azure_dataset_id)
    logging.info("Job submission to AzureML done.")
    if self.azure_config.pytest_mark:
        # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
        # A build step will pick up that file and publish it to Azure DevOps.
        # If pytest_mark is set, this file must exist.
        logging.info("Downloading pytest result file.")
        download_pytest_result(azure_run)
    else:
        logging.info(
            "No pytest_mark present, hence not downloading the pytest result file."
        )
    status = azure_run.get_status()
    # For PR builds where we wait for job completion, the job must have ended in a COMPLETED state.
    # If a pytest failed, the runner has exited with code -1 (see below)
    if self.azure_config.wait_for_completion and status != RunStatus.COMPLETED:
        logging.error(f"Job completed with status {status}. Exiting.")
        # Use sys.exit rather than the bare exit() builtin: exit() is injected by the site module and is
        # not available when the interpreter runs without it (python -S, embedded or frozen builds).
        sys.exit(-1)
    return azure_run
def after_submission_hook(azure_run: Run) -> None:
    """
    Callback that is invoked right after a job has been submitted. It labels the run, prints recovery and
    monitoring hints, and optionally blocks until the run has finished.

    :param azure_run: The run that was just submitted.
    :raises ValueError: If we waited for completion and the run status is FAILED afterwards.
    """
    # The "tag" that the user provided doubles as the default display name. That applies to single runs
    # as well as to Hyperdrive parent runs.
    if self.azure_config.tag:
        azure_run.display_name = self.azure_config.tag

    # Attach an additional tag derived from the run that was actually submitted. It is used for filtering
    # runs later, in cross validation analysis.
    recovery_id = create_run_recovery_id(azure_run)
    azure_run.tag(RUN_RECOVERY_ID_KEY_NAME, recovery_id)
    print(
        "If this run fails, re-start runner.py and supply these additional arguments: "
        f"--run_recovery_id={recovery_id}")

    if not self.azure_config.tensorboard:
        print(
            f"To monitor this run locally using TensorBoard, run the script: "
            f"InnerEye/Azure/tensorboard_monitor.py --run_ids={azure_run.id}"
        )
    else:
        print(
            "Starting TensorBoard now because you specified --tensorboard"
        )
        tensorboard_config = AMLTensorBoardMonitorConfig(run_ids=[azure_run.id])
        monitor(monitor_config=tensorboard_config, azure_config=self.azure_config)

    # Everything below only applies when the caller wants to block until the job has finished.
    if not self.azure_config.wait_for_completion:
        return

    # Stream the job output to the console. A failing job must not raise here, because the pytest result
    # file may still have to be downloaded first.
    azure_run.wait_for_completion(show_output=True, raise_on_error=False)

    if self.azure_config.pytest_mark:
        # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
        # A build step will pick up that file and publish it to Azure DevOps.
        # If pytest_mark is set, this file must exist.
        logging.info("Downloading pytest result file.")
        download_pytest_result(azure_run)

    run_has_failed = azure_run.status == RunStatus.FAILED
    if run_has_failed:
        raise ValueError(
            f"The AzureML run failed. Please check this URL for details: "
            f"{azure_run.get_portal_url()}")
def get_run_and_download_pytest(branch: str, number: int) -> Optional[Path]:
    """
    Look up a single run by its number and download its pytest result file.

    The experiment name is derived from the branch name via to_azure_friendly_string. The names
    'workspace' and 'output_dir' are not parameters; they are taken from the enclosing scope.

    :param branch: The branch name from which the experiment name is derived.
    :param number: The number of the run to look for inside that experiment.
    :return: Whatever download_pytest_result returns for the matching run.
    :raises ValueError: If the experiment does not contain exactly one run with the given number.
    """
    experiment_name = to_azure_friendly_string(branch)
    experiment = Experiment(workspace, name=experiment_name)
    matching_runs = [candidate for candidate in experiment.get_runs() if candidate.number == number]
    if len(matching_runs) == 1:
        return download_pytest_result(matching_runs[0], output_dir)
    raise ValueError(
        f"Expected to get exactly 1 run in experiment {experiment.name}"
    )
def submit_to_azureml(self) -> Run:
    """
    Submit a job to AzureML, returning the resulting Run object. If we were asked to wait for completion
    and the run or one of its child runs did not complete, a ValueError is raised.

    :return: The AzureML Run object of the submitted job.
    :raises ValueError: If the model is a DeepLearningConfig and the container has no 'azure_dataset_id'
        set, or if 'wait_for_completion' is set and is_run_and_child_runs_completed reports that the run
        did not complete.
    """
    # The adal package creates a logging.info line each time it gets an authentication token, avoid that.
    logging.getLogger('adal-python').setLevel(logging.WARNING)
    # Azure core prints full HTTP requests even in INFO mode
    logging.getLogger('azure').setLevel(logging.WARNING)
    # PyJWT prints out warnings that are beyond our control.
    # Note that this filter is process-wide and silences all DeprecationWarnings from here on.
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    if isinstance(self.model_config, DeepLearningConfig) and not self.lightning_container.azure_dataset_id:
        raise ValueError("When running an InnerEye built-in model in AzureML, the 'azure_dataset_id' "
                         "property must be set.")

    def hyperdrive_func(run_config):  # type: ignore
        # Look up self.model_config only when the callback is invoked, like the lambda it replaces.
        return self.model_config.get_hyperdrive_config(run_config)  # type: ignore

    source_config = SourceConfig(
        root_folder=self.project_root,
        # The script that was used to start this process is also the entry point of the AzureML job.
        entry_script=Path(sys.argv[0]).resolve(),
        conda_dependencies_files=get_all_environment_files(self.project_root),
        hyperdrive_config_func=hyperdrive_func,
        # For large jobs, upload of results can time out because of large checkpoint files.
        # Default is 600, we use 24 hours instead.
        upload_timeout_seconds=86400,
    )
    source_config.set_script_params_except_submit_flag()
    # NOTE(review): this call takes 4 arguments, so it presumably resolves to a submission helper of the
    # same name that is imported at file level, not to this method - confirm against the imports.
    azure_run = submit_to_azureml(self.azure_config, source_config,
                                  self.lightning_container.all_azure_dataset_ids(),
                                  self.lightning_container.all_dataset_mountpoints())
    logging.info("Job submission to AzureML done.")
    if not self.azure_config.pytest_mark:
        logging.info("No pytest_mark present, hence not downloading the pytest result file.")
    elif not self.azure_config.wait_for_completion:
        # pytest_mark is set, but we are not waiting for the job, so no download is attempted. Say so
        # explicitly instead of claiming that pytest_mark is missing.
        logging.info("pytest_mark is set, but wait_for_completion is off, hence not downloading the "
                     "pytest result file.")
    else:
        # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
        # A build step will pick up that file and publish it to Azure DevOps.
        # If pytest_mark is set, this file must exist.
        logging.info("Downloading pytest result file.")
        download_pytest_result(azure_run)
    # For PR builds where we wait for job completion, the job must have ended in a COMPLETED state.
    if self.azure_config.wait_for_completion and not is_run_and_child_runs_completed(azure_run):
        raise ValueError(f"Run {azure_run.id} in experiment {azure_run.experiment.name} or one of its child "
                         "runs failed.")
    return azure_run