Example #1
 def submit_to_azureml(self) -> Run:
     """
     Submit a job to AzureML, returning the resulting Run object, or exiting if we were asked to wait for
     completion and the Run did not succeed.
     """
     # The adal package emits a logging.info line each time it acquires an authentication token; suppress that.
     logging.getLogger('adal-python').setLevel(logging.WARNING)
     if not self.model_config.azure_dataset_id:
         raise ValueError(
             "When running on AzureML, the 'azure_dataset_id' property must be set."
         )
     model_config_overrides = str(self.model_config.overrides)
     source_config = SourceConfig(
         root_folder=self.project_root,
         entry_script=Path(sys.argv[0]).resolve(),
         conda_dependencies_files=[
             get_environment_yaml_file(),
             self.project_root / fixed_paths.ENVIRONMENT_YAML_FILE_NAME
         ],
         hyperdrive_config_func=lambda estimator: self.model_config.get_hyperdrive_config(estimator),
         # For large jobs, uploading the results frequently times out because of large checkpoint files. The default is 600 seconds.
         upload_timeout_seconds=86400,
     )
     source_config.set_script_params_except_submit_flag()
     assert self.model_config.azure_dataset_id is not None  # to stop mypy complaining about next line
     azure_run = submit_to_azureml(self.azure_config, source_config,
                                   model_config_overrides,
                                   self.model_config.azure_dataset_id)
     logging.info("Job submission to AzureML done.")
     if self.azure_config.pytest_mark:
         # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
         # A build step will pick up that file and publish it to Azure DevOps.
         # If pytest_mark is set, this file must exist.
         logging.info("Downloading pytest result file.")
         download_pytest_result(azure_run)
     else:
         logging.info(
             "No pytest_mark present, hence not downloading the pytest result file."
         )
     status = azure_run.get_status()
     # For PR builds where we wait for job completion, the job must have ended in a COMPLETED state.
     # If a pytest failed, the runner has exited with code -1 (see below)
     if self.azure_config.wait_for_completion and status != RunStatus.COMPLETED:
         logging.error(f"Job completed with status {status}. Exiting.")
         sys.exit(-1)
     return azure_run
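
The `hyperdrive_config_func` above defers to the model configuration's `get_hyperdrive_config`. As a hedged illustration only, the sketch below shows what such a function typically returns when built on the azureml-sdk HyperDrive API; the swept parameter and metric name are made up and do not come from the repository:

    # Illustrative sketch -- parameter names and metric are placeholders, not InnerEye's defaults.
    from azureml.train.hyperdrive import (GridParameterSampling, HyperDriveConfig,
                                          PrimaryMetricGoal, choice)

    def get_hyperdrive_config(estimator) -> HyperDriveConfig:
        # Grid-search a single (made-up) hyperparameter; HyperDrive starts one child run per value.
        sampling = GridParameterSampling({"--l_rate": choice(1e-3, 1e-4)})
        return HyperDriveConfig(estimator=estimator,
                                hyperparameter_sampling=sampling,
                                primary_metric_name="val_loss",
                                primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                                max_total_runs=2)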
Example #2
        def after_submission_hook(azure_run: Run) -> None:
            """
            A function that will be called right after job submission.
            """
            # Set the default display name to what was provided as the "tag". This will affect single runs
            # and Hyperdrive parent runs
            if self.azure_config.tag:
                azure_run.display_name = self.azure_config.tag
            # Add an extra tag that depends on the run that was actually submitted. This is used for later filtering
            # runs in cross-validation analysis.
            recovery_id = create_run_recovery_id(azure_run)
            azure_run.tag(RUN_RECOVERY_ID_KEY_NAME, recovery_id)
            print(
                "If this run fails, re-start runner.py and supply these additional arguments: "
                f"--run_recovery_id={recovery_id}")
            if self.azure_config.tensorboard:
                print(
                    "Starting TensorBoard now because you specified --tensorboard"
                )
                monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[azure_run.id]),
                        azure_config=self.azure_config)
            else:
                print(
                    f"To monitor this run locally using TensorBoard, run the script: "
                    f"InnerEye/Azure/tensorboard_monitor.py --run_ids={azure_run.id}"
                )

            if self.azure_config.wait_for_completion:
                # We want the job output to be visible on the console. Do not exit yet if the job fails, because we
                # may need to download the pytest result file.
                azure_run.wait_for_completion(show_output=True,
                                              raise_on_error=False)
                if self.azure_config.pytest_mark:
                    # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
                    # A build step will pick up that file and publish it to Azure DevOps.
                    # If pytest_mark is set, this file must exist.
                    logging.info("Downloading pytest result file.")
                    download_pytest_result(azure_run)
                if azure_run.status == RunStatus.FAILED:
                    raise ValueError(
                        f"The AzureML run failed. Please check this URL for details: "
                        f"{azure_run.get_portal_url()}")
Example #3
 def get_run_and_download_pytest(branch: str, number: int) -> Optional[Path]:
     # Relies on 'workspace' and 'output_dir' being available in the enclosing scope.
     experiment = Experiment(workspace,
                             name=to_azure_friendly_string(branch))
     runs = [run for run in experiment.get_runs() if run.number == number]
     if len(runs) != 1:
         raise ValueError(f"Expected exactly 1 run with number {number} in experiment "
                          f"{experiment.name}, but found {len(runs)}.")
     return download_pytest_result(runs[0], output_dir)
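
Note that the function above reads `workspace` and `output_dir` from its enclosing scope rather than taking them as arguments. A brief usage sketch with placeholder values; the branch name, run number, and output directory are made up, and `Workspace.from_config()` assumes a local workspace config file:

    # Placeholders only -- branch, run number and paths are illustrative.
    from pathlib import Path
    from azureml.core import Workspace

    workspace = Workspace.from_config()        # reads config.json for the AzureML workspace
    output_dir = Path("pytest_outputs")
    output_dir.mkdir(exist_ok=True)
    result_file = get_run_and_download_pytest(branch="refs/pull/123/merge", number=42)
    if result_file is None:
        print("The run did not produce a pytest result file.")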
Example #4
 def submit_to_azureml(self) -> Run:
     """
     Submit a job to AzureML, returning the resulting Run object, or exiting if we were asked to wait for
     completion and the Run did not succeed.
     """
     # The adal package emits a logging.info line each time it acquires an authentication token; suppress that.
     logging.getLogger('adal-python').setLevel(logging.WARNING)
     # Azure core prints full HTTP requests even in INFO mode
     logging.getLogger('azure').setLevel(logging.WARNING)
     # PyJWT prints out warnings that are beyond our control
     warnings.filterwarnings("ignore", category=DeprecationWarning)
     if isinstance(self.model_config, DeepLearningConfig) and not self.lightning_container.azure_dataset_id:
         raise ValueError("When running an InnerEye built-in model in AzureML, the 'azure_dataset_id' "
                          "property must be set.")
     hyperdrive_func = lambda run_config: self.model_config.get_hyperdrive_config(run_config)  # type: ignore
     source_config = SourceConfig(
         root_folder=self.project_root,
         entry_script=Path(sys.argv[0]).resolve(),
         conda_dependencies_files=get_all_environment_files(self.project_root),
         hyperdrive_config_func=hyperdrive_func,
         # For large jobs, uploading the results can time out because of large checkpoint files. The default is 600 seconds.
         upload_timeout_seconds=86400,
     )
     source_config.set_script_params_except_submit_flag()
     azure_run = submit_to_azureml(self.azure_config, source_config,
                                   self.lightning_container.all_azure_dataset_ids(),
                                   self.lightning_container.all_dataset_mountpoints())
     logging.info("Job submission to AzureML done.")
     if self.azure_config.pytest_mark and self.azure_config.wait_for_completion:
         # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
         # A build step will pick up that file and publish it to Azure DevOps.
         # If pytest_mark is set, this file must exist.
         logging.info("Downloading pytest result file.")
         download_pytest_result(azure_run)
     else:
         logging.info("No pytest_mark present, hence not downloading the pytest result file.")
     # For PR builds where we wait for job completion, the job must have ended in a COMPLETED state.
     if self.azure_config.wait_for_completion and not is_run_and_child_runs_completed(azure_run):
         raise ValueError(f"Run {azure_run.id} in experiment {azure_run.experiment.name} or one of its child "
                          "runs failed.")
     return azure_run
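
Example #4 delegates its final success check to `is_run_and_child_runs_completed`. Below is a minimal sketch of such a check, assuming the azureml-core `Run.get_children()` and `Run.get_status()` calls and the same `RunStatus` constants used in the examples above; this is an illustration, not the repository's actual implementation:

    # Sketch only -- the repository's implementation may differ in detail.
    from azureml.core import Run

    def is_run_and_child_runs_completed(run: Run) -> bool:
        # The submission counts as successful only if the run itself and every child run
        # (for example, Hyperdrive children) finished in the Completed state.
        # RunStatus: the same constants class used in the examples above (import omitted).
        runs = [run] + list(run.get_children())
        return all(r.get_status() == RunStatus.COMPLETED for r in runs)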