Пример #1
0
def get_or_create_python_environment(azure_config: AzureConfig,
                                     source_config: SourceConfig,
                                     environment_name: str = "",
                                     register_environment: bool = True) -> Environment:
    """
    Returns the AzureML Python environment to use for a run. An existing environment is looked up by name (at
    version ENVIRONMENT_VERSION) in the workspace. If that lookup fails for any reason, a new Docker-based
    environment is built from the merged Conda files given in `source_config`.
    :param azure_config: Azure related configuration. Supplies the workspace, the optional pip extra index URL
    and the Docker shared memory size.
    :param source_config: Supplies the Conda dependency files, the upload timeout and any extra environment
    variables.
    :param environment_name: If non-empty, the lookup uses this name instead of the hash-based name. A newly
    created environment always gets the hash-based name.
    :param register_environment: If True, a newly created environment is registered in the AzureML workspace.
    If False, it is only created, but not registered.
    :return: The retrieved or the newly created environment.
    """
    # The Conda files of the outer project and of InnerEye are merged into one dependency set. The merged YAML
    # text is used further down as part of the environment hash.
    conda_dependencies, merged_yaml = merge_conda_dependencies(source_config.conda_dependencies_files)  # type: ignore
    extra_index_url = azure_config.pip_extra_index_url
    if extra_index_url:
        # Make the extra index the primary one and PyPI the fallback, so that packages on the extra index win
        # over packages of the same name on PyPI.
        conda_dependencies.set_pip_option(f"--index-url {extra_index_url}")
        conda_dependencies.set_pip_option("--extra-index-url https://pypi.org/simple")
    env_variables = {
        "AZUREML_OUTPUT_UPLOAD_TIMEOUT_SEC": str(source_config.upload_timeout_seconds),
        "MKL_SERVICE_FORCE_INTEL": "1",
    }
    # Variables given in source_config override the two defaults above.
    env_variables.update(source_config.environment_variables or {})
    base_image = "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04"
    # Build a name from a hash over everything that can reasonably change. hashlib is used because the built-in
    # hash function gives different results for the same string in different python instances.
    hash_parts = (merged_yaml, azure_config.docker_shm_size, base_image, str(env_variables))
    overall_hash = hashlib.sha1("\n".join(hash_parts).encode("utf8")).hexdigest()[:32]
    unique_env_name = f"InnerEye-{overall_hash}"
    try:
        existing_env = Environment.get(azure_config.get_workspace(),
                                       name=environment_name or unique_env_name,
                                       version=ENVIRONMENT_VERSION)
        logging.info(f"Using existing Python environment '{existing_env.name}'.")
        return existing_env
    except Exception:
        # Any failure of the lookup is treated as "environment not found".
        logging.info(f"Python environment '{unique_env_name}' does not yet exist, creating and registering it.")
    new_env = Environment(name=unique_env_name)
    new_env.docker.enabled = True
    new_env.docker.shm_size = azure_config.docker_shm_size
    new_env.python.conda_dependencies = conda_dependencies
    new_env.docker.base_image = base_image
    new_env.environment_variables = env_variables
    if register_environment:
        new_env.register(azure_config.get_workspace())
    return new_env
    def download_checkpoints_from_recovery_run(azure_config: AzureConfig,
                                               config: DeepLearningConfig,
                                               run_context: Optional[Run] = None) -> RunRecovery:
        """
        Fetches the run that `azure_config.run_recovery_id` points to and downloads its checkpoints via
        RunRecovery.download_checkpoints_from_run.

        :param azure_config: Azure related configs. Supplies the workspace and the run recovery ID.
        :param config: Model related configs, passed through to the checkpoint download.
        :param run_context: Context of the current run. Falls back to RUN_CONTEXT if not given.
        :return: The RunRecovery object produced by the checkpoint download.
        :raises ValueError: If azure_config.run_recovery_id is not set.
        """
        run_context = run_context or RUN_CONTEXT
        workspace = azure_config.get_workspace()

        recovery_id = azure_config.run_recovery_id
        if not recovery_id:
            raise ValueError("A valid run_recovery_id is required to download recovery checkpoints, found None")

        run_to_recover = fetch_run(workspace, recovery_id.strip())
        if is_cross_validation_child_run(run_context):
            # The current run is a cross validation child run, so the recovery ID refers to the parent of the
            # prior run. Recover from the child of that parent that has the same split index as the current run.
            children_with_same_split = (
                child for child in fetch_child_runs(run_to_recover)
                if get_cross_validation_split_index(child) == get_cross_validation_split_index(run_context)
            )
            run_to_recover = next(children_with_same_split)

        return RunRecovery.download_checkpoints_from_run(config, run_to_recover)
def get_comparison_baselines(outputs_folder: Path, azure_config: AzureConfig,
                             comparison_blob_storage_paths: List[Tuple[str, str]]) -> \
        List[ComparisonBaseline]:
    """
    Builds one ComparisonBaseline per entry of `comparison_blob_storage_paths`, from the dataset and metrics
    CSV files located by get_comparison_baseline_paths.
    :param outputs_folder: Folder that is passed to get_comparison_baseline_paths.
    :param azure_config: Azure related configuration, used to get the workspace.
    :param comparison_blob_storage_paths: Pairs of (comparison name, path). Each path has the form
    "[experiment:]run_folder/blob_path".
    :return: The list of comparison baselines, in the order of the input.
    :raises ValueError: If the dataset file or the metrics file for a run is missing.
    """
    workspace = azure_config.get_workspace()
    baselines = []
    for name, full_path in comparison_blob_storage_paths:
        # Only the part after the last colon is used; anything before it is the experiment name.
        path_without_experiment = full_path.split(":")[-1]
        run_folder, blob_path_str = path_without_experiment.split("/", 1)
        run_rec_id = strip_prefix(run_folder, AZUREML_RUN_FOLDER_PREFIX)
        blob_path = Path(strip_prefix(blob_path_str, DEFAULT_AML_UPLOAD_DIR + "/"))
        run = fetch_run(workspace, run_rec_id)
        dataset_csv, metrics_csv = get_comparison_baseline_paths(
            outputs_folder, blob_path, run, DATASET_CSV_FILE_NAME)
        both_files_present = (dataset_csv is not None and metrics_csv is not None
                              and dataset_csv.exists() and metrics_csv.exists())
        if not both_files_present:
            raise ValueError(
                f"could not find comparison data for run {run_rec_id}")
        # Both files are available: read them and store them together with the run ID.
        baseline = ComparisonBaseline(name,
                                      pd.read_csv(dataset_csv),
                                      pd.read_csv(metrics_csv),
                                      run_rec_id)
        baselines.append(baseline)
    return baselines
Пример #4
0
def test_download_checkpoints(test_output_dirs: OutputFolderForTests, is_ensemble: bool,
                              runner_config: AzureConfig) -> None:
    """
    Checks that checkpoints downloaded for a recovery run end up in the expected files, both for a single
    run and for an ensemble run (one checkpoint per child run).
    """
    assert get_results_blob_path("some_run_id") == "azureml/ExperimentRun/dcid.some_run_id"
    # Any recent run ID from a PR build will do. Use a PR build because the checkpoint files are small there.
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)

    if is_ensemble:
        runner_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    else:
        runner_config.run_recovery_id = DEFAULT_RUN_RECOVERY_ID
    run_recovery = RunRecovery.download_checkpoints_from_recovery_run(runner_config, config)
    run_to_recover = fetch_run(workspace=runner_config.get_workspace(), run_recovery_id=runner_config.run_recovery_id)
    checkpoint_name = "1" + CHECKPOINT_FILE_SUFFIX
    if is_ensemble:
        # One checkpoint per child run, stored in a folder named after the child's split index.
        expected_files = []
        for child_run in fetch_child_runs(run_to_recover):
            split_index = str(child_run.get_tags()['cross_validation_split_index'])
            expected_files.append(config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME / split_index / checkpoint_name)
    else:
        expected_files = [config.checkpoint_folder / run_to_recover.id / checkpoint_name]

    checkpoint_paths = run_recovery.get_checkpoint_paths(1)
    if is_ensemble:
        expected_roots = [expected_file.parent for expected_file in expected_files]
        assert len(run_recovery.checkpoints_roots) == len(expected_files)
        assert all(root in expected_roots for root in run_recovery.checkpoints_roots)
        assert len(checkpoint_paths) == len(expected_files)
        assert all(path in expected_files for path in checkpoint_paths)
    else:
        assert len(checkpoint_paths) == 1
        assert checkpoint_paths[0] == expected_files[0]

    assert all(expected_file.exists() for expected_file in expected_files)
Пример #5
0
def create_run_config(azure_config: AzureConfig,
                      source_config: SourceConfig,
                      all_azure_dataset_ids: List[str],
                      all_dataset_mountpoints: List[str],
                      environment_name: str = "") -> ScriptRunConfig:
    """
    Builds the AzureML script run configuration for the entry script given in `source_config`.
    :param azure_config: Azure related configuration: workspace, cluster, number of nodes, maximum run duration
    and the hyperdrive flag.
    :param source_config: Configuration for the code to run: root folder, entry script, script arguments, and
    the function that creates a Hyperdrive configuration.
    :param all_azure_dataset_ids: The name of all datasets on blob storage that will be used for this run.
    :param all_dataset_mountpoints: When using the datasets in AzureML, these are the per-dataset mount points.
    :param environment_name: Passed on to get_or_create_python_environment: if given, an existing Python
    environment of that name is used if it can be found.
    :return: The configured script run. If azure_config.hyperdrive is set, this is the result of
    source_config.hyperdrive_config_func applied to the script run configuration.
    """
    dataset_consumptions = create_dataset_consumptions(
        azure_config, all_azure_dataset_ids, all_dataset_mountpoints)
    root_folder = source_config.root_folder
    entry_script = source_config.entry_script
    # AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
    entry_script_relative_path = entry_script.relative_to(root_folder).as_posix()
    logging.info(f"Entry script {entry_script_relative_path} ({entry_script} relative to "
                 f"source directory {root_folder})")
    if azure_config.max_run_duration:
        max_run_duration = run_duration_string_to_seconds(azure_config.max_run_duration)
    else:
        max_run_duration = None
    workspace = azure_config.get_workspace()
    run_config = RunConfiguration(script=entry_script_relative_path,
                                  arguments=source_config.script_params)
    run_config.environment = get_or_create_python_environment(
        azure_config, source_config, environment_name=environment_name)
    run_config.target = azure_config.cluster
    run_config.max_run_duration_seconds = max_run_duration
    if azure_config.num_nodes > 1:
        # Multi-node job: configure MPI across the requested number of nodes.
        mpi_config = MpiConfiguration(node_count=azure_config.num_nodes)
        run_config.mpi = mpi_config
        run_config.framework = "Python"
        run_config.communicator = "IntelMpi"
        run_config.node_count = mpi_config.node_count
    if len(dataset_consumptions) > 0:
        data_by_name = {}
        for consumption in dataset_consumptions:
            data_by_name[consumption.name] = consumption
        run_config.data = data_by_name
    # Use blob storage for storing the source, rather than the FileShares section of the storage account.
    blob_store = workspace.datastores.get(WORKSPACE_DEFAULT_BLOB_STORE_NAME)
    run_config.source_directory_data_store = blob_store.name
    script_run_config = ScriptRunConfig(source_directory=str(root_folder),
                                        run_config=run_config)
    if not azure_config.hyperdrive:
        return script_run_config
    return source_config.hyperdrive_config_func(script_run_config)  # type: ignore
def submit_for_inference(args: SubmitForInferenceConfig,
                         azure_config: AzureConfig) -> Optional[Path]:
    """
    Create and submit an inference to AzureML, and optionally download the resulting segmentation.
    The submission is assembled in a temporary folder. That folder is deleted once the run has been submitted
    (or submission has failed), unless `args.keep_upload_folder` is set, in which case it is left on disk.
    :param azure_config: An object with all necessary information for accessing Azure.
    :param args: configuration, see SubmitForInferenceConfig
    :return: path to downloaded segmentation on local disc, or None if none.
    """
    logging.info(f"Building Azure configuration from {args.settings}")
    logging.info("Getting workspace")
    workspace = azure_config.get_workspace()
    logging.info("Identifying model")
    model = Model(workspace=workspace, id=args.model_id)
    model_id = model.id
    logging.info(f"Identified model {model_id}")
    # Use mkdtemp rather than TemporaryDirectory: a TemporaryDirectory object deletes its folder when the
    # object is finalised, which would remove the folder even when keep_upload_folder is set.
    source_directory_name = tempfile.mkdtemp()
    logging.info(
        f"Building inference run submission in {source_directory_name}")
    source_directory_path = Path(source_directory_name)
    try:
        copy_image_file(args.image_file,
                        source_directory_path / DEFAULT_DATA_FOLDER)
        # We copy over run_scoring.py, and score.py as well in case the model we're using
        # does not have sufficiently recent versions of those files.
        for base in ["run_scoring.py", "score.py"]:
            shutil.copyfile(base, str(source_directory_path / base))
        source_config = SourceConfig(
            root_folder=source_directory_name,
            entry_script=str(source_directory_path / "run_scoring.py"),
            script_params={
                "--data-folder": ".",
                "--spawnprocess": "python",
                "--model-id": model_id,
                "score.py": ""
            },
            conda_dependencies_files=download_conda_dependency_files(
                model, source_directory_path))
        estimator = create_estimator_from_configs(workspace, azure_config,
                                                  source_config, [])
        exp = Experiment(workspace=workspace, name=args.experiment_name)
        run = exp.submit(estimator)
        logging.info(f"Submitted run {run.id} in experiment {run.experiment.name}")
        logging.info(f"Run URL: {run.get_portal_url()}")
    finally:
        # Clean up on success and on failure alike, so a failed submission does not leave the folder behind.
        if not args.keep_upload_folder:
            shutil.rmtree(source_directory_name, ignore_errors=True)
            logging.info(f"Deleted submission directory {source_directory_name}")
    if args.download_folder is None:
        return None
    logging.info("Awaiting run completion")
    run.wait_for_completion()
    logging.info(f"Run has completed with status {run.get_status()}")
    download_path = choose_download_path(args.download_folder)
    logging.info(f"Attempting to download segmentation to {download_path}")
    run.download_file(DEFAULT_RESULT_IMAGE_NAME, str(download_path))
    if download_path.exists():
        logging.info(f"Downloaded segmentation to {download_path}")
    else:
        logging.warning("Segmentation NOT downloaded")
    return download_path
Пример #7
0
def create_run_config(azure_config: AzureConfig,
                      source_config: SourceConfig,
                      azure_dataset_id: str = "",
                      environment_name: str = "") -> ScriptRunConfig:
    """
    Builds the AzureML script run configuration for the entry script given in `source_config`, optionally
    consuming a single AzureML dataset.
    :param azure_config: Azure related configuration: workspace, cluster, number of nodes, maximum run duration,
    whether to mount or download the dataset, and the hyperdrive flag.
    :param source_config: Configuration for the code to run: root folder, entry script, script arguments, and
    the function that creates a Hyperdrive configuration.
    :param azure_dataset_id: The name of the dataset in blob storage to be used for this run. This can be an empty
    string to not use any datasets.
    :param environment_name: Passed on to get_or_create_python_environment: if given, an existing Python
    environment of that name is used if it can be found.
    :return: The configured script run. If azure_config.hyperdrive is set, this is the result of
    source_config.hyperdrive_config_func applied to the script run configuration.
    :raises ValueError: If a dataset ID is given, but no dataset could be found or created for it.
    """
    dataset_consumption = None
    if azure_dataset_id:
        azureml_dataset = get_or_create_dataset(azure_config, azure_dataset_id=azure_dataset_id)
        if not azureml_dataset:
            raise ValueError(f"AzureML dataset {azure_dataset_id} could not be found or created.")
        named_input = azureml_dataset.as_named_input(INPUT_DATA_KEY)
        if azure_config.use_dataset_mount:
            dataset_consumption = named_input.as_mount()
        else:
            dataset_consumption = named_input.as_download()
    root_folder = source_config.root_folder
    entry_script = source_config.entry_script
    # AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
    entry_script_relative_path = entry_script.relative_to(root_folder).as_posix()
    logging.info(f"Entry script {entry_script_relative_path} ({entry_script} relative to "
                 f"source directory {root_folder})")
    if azure_config.max_run_duration:
        max_run_duration = run_duration_string_to_seconds(azure_config.max_run_duration)
    else:
        max_run_duration = None
    workspace = azure_config.get_workspace()
    run_config = RunConfiguration(script=entry_script_relative_path,
                                  arguments=source_config.script_params)
    run_config.environment = get_or_create_python_environment(azure_config, source_config,
                                                              environment_name=environment_name)
    run_config.target = azure_config.cluster
    run_config.max_run_duration_seconds = max_run_duration
    if azure_config.num_nodes > 1:
        # Multi-node job: configure MPI across the requested number of nodes.
        mpi_config = MpiConfiguration(node_count=azure_config.num_nodes)
        run_config.mpi = mpi_config
        run_config.framework = "Python"
        run_config.communicator = "IntelMpi"
        run_config.node_count = mpi_config.node_count
    if dataset_consumption:
        run_config.data = {dataset_consumption.name: dataset_consumption}
    # Use blob storage for storing the source, rather than the FileShares section of the storage account.
    blob_store = workspace.datastores.get(WORKSPACE_DEFAULT_BLOB_STORE_NAME)
    run_config.source_directory_data_store = blob_store.name
    script_run_config = ScriptRunConfig(source_directory=str(root_folder),
                                        run_config=run_config)
    if not azure_config.hyperdrive:
        return script_run_config
    return source_config.hyperdrive_config_func(script_run_config)  # type: ignore
Пример #8
0
def get_comparison_baselines(outputs_folder: Path, azure_config: AzureConfig,
                             comparison_blob_storage_paths: List[Tuple[str, str]]) -> \
        List[ComparisonBaseline]:
    """
    Downloads the dataset and metrics CSV files for each comparison run, and builds one ComparisonBaseline for
    every run where both files could be downloaded. Runs with missing files are skipped with warnings.
    :param outputs_folder: Folder under which the files are stored, in a subfolder per run and blob path.
    :param azure_config: Azure related configuration, used to get the workspace.
    :param comparison_blob_storage_paths: Pairs of (comparison name, path). Each path has the form
    "[experiment:]run_folder/blob_path".
    :return: The comparison baselines for all runs where both files were found, in the order of the input.
    """
    workspace = azure_config.get_workspace()
    comparison_baselines = []
    for (comparison_name, comparison_path) in comparison_blob_storage_paths:
        # Discard the experiment part of the run rec ID, if any.
        comparison_path = comparison_path.split(":")[-1]
        run_rec_id, blob_path_str = comparison_path.split("/", 1)
        run_rec_id = strip_prefix(run_rec_id, AZUREML_RUN_FOLDER_PREFIX)
        blob_path = Path(strip_prefix(blob_path_str, DEFAULT_AML_UPLOAD_DIR + "/"))
        run = fetch_run(workspace, run_rec_id)
        # We usually find dataset.csv in the same directory as metrics.csv, but we sometimes
        # have to look higher up.
        comparison_dataset_path: Optional[Path] = None
        comparison_metrics_path: Optional[Path] = None
        destination_folder = outputs_folder / run_rec_id / blob_path
        # Look for dataset.csv inside epoch_NNN/Test, epoch_NNN/ and at top level
        for blob_path_parent in step_up_directories(blob_path):
            try:
                comparison_dataset_path = download_outputs_from_run(
                    blob_path_parent / DATASET_CSV_FILE_NAME, destination_folder, run, True)
                break
            except ValueError:
                # Not found at this level: continue with the next directory that the search yields.
                logging.warning(f"cannot find {DATASET_CSV_FILE_NAME} at {blob_path_parent} in {run_rec_id}")
            except NotADirectoryError:
                logging.warning(f"{blob_path_parent} is not a directory")
                break
        # Summary warning, emitted once after the search has ended, rather than after every failed step.
        if comparison_dataset_path is None:
            logging.warning(f"cannot find {DATASET_CSV_FILE_NAME} at or above {blob_path} in {run_rec_id}")
        # Look for epoch_NNN/Test/metrics.csv
        try:
            comparison_metrics_path = download_outputs_from_run(
                blob_path / METRICS_FILE_NAME, destination_folder, run, True)
        except ValueError:
            logging.warning(f"cannot find {METRICS_FILE_NAME} at {blob_path} in {run_rec_id}")
        # If both dataset.csv and metrics.csv were downloaded successfully, read their contents and
        # add a tuple to the comparison data.
        if comparison_dataset_path is not None and comparison_metrics_path is not None and \
                comparison_dataset_path.exists() and comparison_metrics_path.exists():
            comparison_baselines.append(ComparisonBaseline(
                comparison_name,
                pd.read_csv(comparison_dataset_path),
                pd.read_csv(comparison_metrics_path),
                run_rec_id))
        else:
            # Report which of the two files is missing, to help with diagnosing the problem.
            logging.warning(f"could not find comparison data for run {run_rec_id}")
            for key, path in ("dataset", comparison_dataset_path), ("metrics", comparison_metrics_path):
                logging.warning(f"path to {key} data is {path}")
                # noinspection PyUnresolvedReferences
                if path is not None and not path.exists():
                    logging.warning("    ... but it does not exist")
    return comparison_baselines
def monitor(monitor_config: AMLTensorBoardMonitorConfig,
            azure_config: AzureConfig) -> None:
    """
    Starts TensorBoard monitoring as per the provided arguments, and blocks until the user presses Enter.
    The runs to monitor are either those given in `monitor_config.run_ids`, or, if that is None, the runs of
    the experiment `monitor_config.experiment_name`, optionally filtered by `monitor_config.run_status`.
    The process exits with status 1 if an empty list of run IDs is given, if the experiment does not exist in
    the workspace, or if no runs are found.
    :param monitor_config: The config containing information on which runs that need be monitored.
    :param azure_config: An AzureConfig object with secrets/keys to access the workspace.
    """
    # Fetch AzureML workspace and the experiment runs in it
    workspace = azure_config.get_workspace()

    if monitor_config.run_ids is not None:
        if len(monitor_config.run_ids) == 0:
            print("At least one run_recovery_id must be given for monitoring.")
            sys.exit(1)
        exp_runs = [
            azure_util.fetch_run(workspace, run_id)
            for run_id in monitor_config.run_ids
        ]
    else:
        if monitor_config.experiment_name not in workspace.experiments:
            print(f"The experiment: {monitor_config.experiment_name} doesn't "
                  f"exist in the {monitor_config.workspace_name} workspace.")
            sys.exit(1)

        experiment = Experiment(workspace, monitor_config.experiment_name)
        filters = common_util.get_items_from_string(
            monitor_config.run_status) if monitor_config.run_status else []

        exp_runs = azure_util.fetch_runs(experiment, filters)

        if len(exp_runs) == 0:
            _msg = "No runs to monitor"
            if monitor_config.run_status:
                # Leading space so that the two parts of the message do not run together.
                _msg += f" with status [{monitor_config.run_status}]."
            print(_msg)
            sys.exit(1)

    # Start TensorBoard on executing machine
    ts = Tensorboard(exp_runs,
                     local_root=str(monitor_config.local_root),
                     port=monitor_config.port)

    print(
        "=============================================================================="
    )
    for run in exp_runs:
        print(f"Run URL: {run.get_portal_url()}")
    print("TensorBoard URL: ")
    ts.start()
    try:
        print(
            "==============================================================================\n\n"
        )
        input("Press Enter to close TensorBoard...")
    finally:
        # Stop TensorBoard even if waiting for input is interrupted (for example by Ctrl-C or end of input).
        ts.stop()
def get_first_child_run(azure_config: AzureConfig) -> Run:
    """
    Returns the first completed child run of the run that `azure_config.run_recovery_id` points to.
    :param azure_config: Azure related configuration. Supplies the workspace and the run recovery ID.
    :return: The first element of the list of child runs with status COMPLETED.
    :raises ValueError: If azure_config.run_recovery_id is not set.
    """
    recovery_id = azure_config.run_recovery_id
    if not recovery_id:
        raise ValueError("azure_config.run_recovery_id is not provided.")
    parent_run = fetch_run(azure_config.get_workspace(), recovery_id)
    completed_children = fetch_child_runs(parent_run, status=RunStatus.COMPLETED)
    return completed_children[0]
Пример #11
0
def create_and_submit_experiment(
        azure_config: AzureConfig,
        source_config: SourceConfig,
        model_config_overrides: str,
        azure_dataset_id: str) -> Run:
    """
    Submits a new run to an AzureML experiment, tags it, writes its run recovery ID to a file, and prints
    information about the run to the console. Starts TensorBoard monitoring afterwards if requested.
    :param azure_config: azure related configurations to setup valid workspace
    :param source_config: The information about which code should be submitted, and which arguments should be used.
    :param model_config_overrides: A string that describes which model parameters were overwritten by commandline
     arguments in the present run. It is passed on to set_run_tags.
    :param azure_dataset_id: The name of the dataset in blob storage to be used for this run.
    :returns: Run object for the submitted AzureML run
    """
    workspace = azure_config.get_workspace()
    friendly_experiment_name = azure_util.to_azure_friendly_string(create_experiment_name(azure_config))
    exp = Experiment(workspace=workspace, name=friendly_experiment_name)

    # Submit the run, then attach the metadata to it.
    run: Run = exp.submit(create_run_config(azure_config, source_config, azure_dataset_id))
    set_run_tags(run, azure_config, model_config_overrides)

    print("\n==============================================================================")
    print(f"Successfully queued new run {run.id} in experiment: {exp.name}")

    if azure_config.run_recovery_id:
        print(f"\nRecovered from: {azure_config.run_recovery_id}")

    # Store the recovery ID of the new run in a file, replacing any file left over from a previous submission.
    recovery_id = azure_util.create_run_recovery_id(run)
    recovery_file = Path(RUN_RECOVERY_FILE)
    if recovery_file.exists():
        recovery_file.unlink()
    recovery_file.write_text(recovery_id)

    print(f"Experiment URL: {exp.get_portal_url()}")
    print(f"Run URL: {run.get_portal_url()}")
    print("If this run fails, re-start runner.py and supply these additional arguments: "
          f"--run_recovery_id={recovery_id}")
    print(f"The run recovery ID has been written to this file: {recovery_file}")
    print("==============================================================================")
    if not (azure_config.tensorboard and azure_config.azureml):
        print(f"To monitor this run locally using TensorBoard, run the script: "
              f"InnerEye/Azure/tensorboard_monitor.py --run_ids={run.id}")
        print("==============================================================================")
    else:
        print("Starting TensorBoard now because you specified --tensorboard")
        monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[run.id]), azure_config=azure_config)
    return run
Пример #12
0
def test_download_checkpoints_hyperdrive_run(test_output_dirs: OutputFolderForTests,
                                             runner_config: AzureConfig) -> None:
    """
    Checks that checkpoints can be recovered for each child run of the default ensemble run, when the child
    run is passed in as the run context.
    """
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    runner_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    ensemble_run = fetch_run(runner_config.get_workspace(), DEFAULT_ENSEMBLE_RUN_RECOVERY_ID)
    checkpoint_name = "1" + CHECKPOINT_FILE_SUFFIX
    # recover child runs separately also to test hyperdrive child run recovery functionality
    for child in fetch_child_runs(run=ensemble_run):
        expected_file = config.checkpoint_folder / child.id / checkpoint_name
        run_recovery = RunRecovery.download_checkpoints_from_recovery_run(runner_config, config, child)
        assert all(path == expected_file for path in run_recovery.get_checkpoint_paths(epoch=1))
        assert expected_file.exists()
Пример #13
0
def download_dataset(azure_dataset_id: str, target_folder: Path,
                     azure_config: AzureConfig) -> Path:
    """
    Downloads a dataset to the executing machine, or re-uses a previous download. A download via blobxfer is
    attempted first. If that does not yield a result or fails, the dataset specified by the azure_dataset_id is
    taken from the AzureML workspace instead. In that case, the dataset is downloaded into the `target_folder`,
    in a subfolder that has the same name as the dataset. If there already appears to be such a folder, and the
    folder contains a dataset.csv file, no download is started.
    :param azure_dataset_id: The name of a dataset that is registered in the AzureML workspace.
    :param target_folder: The folder in which to download the dataset from Azure.
    :param azure_config: All Azure-related configuration options.
    :return: A path on the local machine that contains the dataset.
    :raises ValueError: If the AzureML dataset is not a FileDataset.
    """
    try:
        downloaded_via_blobxfer = download_dataset_via_blobxfer(
            dataset_id=azure_dataset_id,
            azure_config=azure_config,
            target_folder=target_folder)
        if downloaded_via_blobxfer:
            return downloaded_via_blobxfer
    except Exception as ex:
        # Best effort only: any failure here falls through to the AzureML download below.
        print_exception(ex, message="Unable to download dataset via blobxfer.")
    logging.info("Trying to download dataset via AzureML datastore now.")
    # get_or_create_dataset takes the AzureConfig as its first argument (it reads the datastore name from it
    # and gets the workspace itself), not a workspace object.
    azure_dataset = get_or_create_dataset(azure_config, azure_dataset_id)
    if not isinstance(azure_dataset, FileDataset):
        raise ValueError(
            f"Expected to get a FileDataset, but got {type(azure_dataset)}")
    # The downloaded dataset may already exist from a previous run.
    expected_dataset_path = target_folder / azure_dataset_id
    expected_dataset_file = expected_dataset_path / DATASET_CSV_FILE_NAME
    logging.info(
        f"Model training will use dataset '{azure_dataset_id}' in Azure.")
    if expected_dataset_path.is_dir() and expected_dataset_file.is_file():
        logging.info(
            f"The dataset appears to be downloaded already in {expected_dataset_path}. Skipping."
        )
        return expected_dataset_path
    logging.info(
        "Starting to download the dataset - WARNING, this could take very long!"
    )
    with logging_section("Downloading dataset"):
        azure_dataset.download(target_path=str(expected_dataset_path),
                               overwrite=False)
    logging.info(
        f"Azure dataset '{azure_dataset_id}' is now available in {expected_dataset_path}"
    )
    return expected_dataset_path
Пример #14
0
def get_or_create_dataset(azure_config: AzureConfig,
                          azure_dataset_id: str) -> Dataset:
    """
    Looks in the AzureML datastore for a dataset of the given name. If there is no such dataset, a dataset is created
    and registered, assuming that the files are in a folder that has the same name as the dataset. For example, if
    azure_dataset_id is 'foo', then the 'foo' dataset is pointing to <container_root>/datasets/foo folder.

    WARNING: the behaviour of Dataset.File.from_files, used below, is idiosyncratic. For example,
    if "mydataset" storage has two "foo..." subdirectories each containing
    a file dataset.csv and a directory ABC,

    datastore = Datastore.get(workspace, "mydataset")
    # This dataset has the file(s) in foo-bar01 at top level, e.g. dataset.csv
    ds1 = Dataset.File.from_files([(datastore, "foo-bar01/*")])
    # This dataset has two directories at top level, each with a name matching foo-bar*, and each
    # containing dataset.csv.
    ds2 = Dataset.File.from_files([(datastore, "foo-bar*/*")])
    # This dataset contains a single directory "mydataset" at top level, containing a subdirectory
    # foo-bar01, containing dataset.csv and (part of) ABC.
    ds3 = Dataset.File.from_files([(datastore, "foo-bar01/*"),
                                   (datastore, "foo-bar01/ABC/abc_files/*/*.nii.gz")])

    These behaviours can be verified by calling "ds.download()" on each dataset ds.

    :param azure_config: Azure-related configuration. Its `azureml_datastore` field must name the datastore to use,
    and its `get_workspace()` method provides the AzureML workspace.
    :param azure_dataset_id: The name of the dataset to retrieve. If it has to be created, this is also used as the
    folder name inside the datastore and as the name under which the dataset is registered.
    :return: The dataset that was retrieved by name, or the newly created dataset.
    :raises ValueError: If `azure_config.azureml_datastore` is empty.
    """
    if not azure_config.azureml_datastore:
        raise ValueError(
            "No value set for 'azureml_datastore' (name of the datastore in the AzureML workspace)"
        )
    logging.info(
        f"Retrieving datastore '{azure_config.azureml_datastore}' from AzureML workspace"
    )
    workspace = azure_config.get_workspace()
    datastore = Datastore.get(workspace, azure_config.azureml_datastore)
    try:
        logging.info(
            f"Trying to retrieve AzureML Dataset '{azure_dataset_id}'")
        azureml_dataset = Dataset.get_by_name(workspace, name=azure_dataset_id)
        logging.info("Dataset found.")
    except Exception:
        # Deliberately not a bare "except:": KeyboardInterrupt and SystemExit must propagate, rather than
        # being mistaken for "dataset not found" and triggering creation and registration of a new dataset.
        logging.info(
            f"Dataset does not yet exist, creating a new one from data in folder '{azure_dataset_id}'"
        )
        # See WARNING above before changing the from_files call!
        azureml_dataset = Dataset.File.from_files([(datastore,
                                                    azure_dataset_id)])
        logging.info("Registering the dataset for future use.")
        azureml_dataset.register(workspace, name=azure_dataset_id)
    return azureml_dataset
def submit_to_azureml(azure_config: AzureConfig, source_config: SourceConfig,
                      model_config_overrides: str,
                      azure_dataset_id: str) -> Run:
    """
    The main entry point for job submission. It retrieves the AzureML workspace, submits an experiment via
    `create_and_submit_experiment`, and blocks until the run has finished if `azure_config.wait_for_completion`
    is set. Handlers for SIGINT and SIGTERM are installed that try to cancel the submitted run before exiting.
    :param azure_config: azure related configurations to setup valid workspace
    :param source_config: The information about which code should be submitted, and which arguments should be used.
    :param model_config_overrides: A string that describes which model parameters were overwritten by commandline
     arguments in the present run. It is passed through to `create_and_submit_experiment`.
    :param azure_dataset_id: The name of the dataset on blob storage to be used for this run.
    :return: The submitted AzureML run.
    """
    # Stays None until the run has been submitted. The handler below reads this variable through its closure.
    azure_run: Optional[Run] = None

    # When running as part of the PR build, jobs frequently get interrupted by new pushes to the repository.
    # In this case, we'd like to cancel the current AzureML run before exiting, to reduce cost.
    # However, at present, this does NOT work, the SIGINT is not propagated through.
    def interrupt_handler(signum: int, _frame: Any) -> None:
        logging.info('Process interrupted via signal {}'.format(str(signum)))
        if azure_run:
            logging.info('Trying to terminate the AzureML job now.')
            azure_run.cancel()
        sys.exit(0)

    signal.signal(signal.SIGINT, interrupt_handler)
    signal.signal(signal.SIGTERM, interrupt_handler)

    # Retrieve the AzureML workspace, then create and submit the train/test experiment.
    workspace = azure_config.get_workspace()
    azure_run = create_and_submit_experiment(workspace, azure_config,
                                             source_config,
                                             model_config_overrides,
                                             azure_dataset_id)

    if not azure_config.wait_for_completion:
        return azure_run

    # We want the job output to be visible on the console, but the program should not exit if the
    # job fails because we need to download the pytest result file.
    azure_run.wait_for_completion(show_output=True, raise_on_error=False)
    return azure_run
Пример #16
0
def submit_for_inference(args: SubmitForInferenceConfig,
                         azure_config: AzureConfig) -> Optional[Path]:
    """
    Create and submit an inference to AzureML, and optionally download the resulting segmentation.
    The submission is assembled in a temporary folder that contains the image to score and a copy of the
    scoring entry script. The call blocks until the AzureML run has completed.
    :param args: configuration, see SubmitForInferenceConfig
    :param azure_config: An object with all necessary information for accessing Azure.
    :return: path to downloaded segmentation on local disc, or None if `args.download_folder` is None.
    :raises ValueError: If the model does not carry the tag that names the AzureML Python environment to use.
    """
    logging.info(f"Building Azure configuration from {args.settings}")
    logging.info("Getting workspace")
    workspace = azure_config.get_workspace()
    logging.info("Identifying model")
    model = Model(workspace=workspace, id=args.model_id)
    model_id = model.id
    logging.info(f"Identified model {model_id}")
    # Use mkdtemp rather than a TemporaryDirectory object: a TemporaryDirectory removes its folder as soon as the
    # object is destroyed, which would happen when this function returns, and hence args.keep_upload_folder could
    # never be honoured. With mkdtemp, deletion is fully under our control.
    source_directory_path = Path(tempfile.mkdtemp())
    try:
        logging.info(
            f"Building inference run submission in {source_directory_path}")
        image_folder = source_directory_path / DEFAULT_DATA_FOLDER
        image = copy_image_file(args.image_file, image_folder, args.use_dicom)
        # Retrieve the name of the Python environment that the training run used. This environment should have been
        # registered at training.
        python_environment_name = model.tags.get(PYTHON_ENVIRONMENT_NAME, "")
        if not python_environment_name:
            raise ValueError(
                f"The model did not contain tag {PYTHON_ENVIRONMENT_NAME} for the AzureML environment to use."
            )
        # Copy the scoring script from the repository. This will start the model download from Azure, and invoke the
        # scoring script.
        entry_script = source_directory_path / Path(RUN_SCORING_SCRIPT).name
        shutil.copyfile(str(repository_root_directory(RUN_SCORING_SCRIPT)),
                        str(entry_script))
        # Re-use the workspace retrieved above, rather than asking azure_config for it a second time.
        run_config = create_run_configuration(
            workspace=workspace,
            compute_cluster_name=azure_config.cluster,
            aml_environment_name=python_environment_name)
        script_run_config = ScriptRunConfig(
            source_directory=str(source_directory_path),
            script=entry_script.relative_to(source_directory_path),
            arguments=[
                "--model-folder",
                ".",
                "--model-id",
                model_id,
                SCORE_SCRIPT,
                # The data folder must be relative to the root folder of the AzureML
                # job. image_files is then just the file relative to the data_folder
                "--data_folder",
                image.parent.name,
                "--image_files",
                image.name,
                "--use_dicom",
                str(args.use_dicom),
                "--model_id",
                model_id
            ],
            run_config=run_config)

        run = submit_run(workspace=workspace,
                         experiment_name=args.experiment_name,
                         script_run_config=script_run_config,
                         wait_for_completion=True)
    finally:
        # Remove the submission folder on success and on failure alike, unless the caller asked to keep it.
        if not args.keep_upload_folder:
            shutil.rmtree(str(source_directory_path))
            logging.info(f"Deleted submission directory {source_directory_path}")
    if args.download_folder is None:
        return None
    logging.info(f"Run has completed with status {run.get_status()}")
    download_file = DEFAULT_RESULT_ZIP_DICOM_NAME if args.use_dicom else DEFAULT_RESULT_IMAGE_NAME
    download_path = choose_download_path(download_file, args.download_folder)
    logging.info(f"Attempting to download segmentation to {download_path}")
    run.download_file(download_file, str(download_path))
    if download_path.exists():
        logging.info(f"Downloaded segmentation to {download_path}")
    else:
        logging.warning("Segmentation NOT downloaded")
    return download_path
def create_estimator_from_configs(
        azure_config: AzureConfig, source_config: SourceConfig,
        estimator_inputs: List[DatasetConsumptionConfig]) -> PyTorch:
    """
    Create and return a PyTorch estimator from the provided configuration information.
    :param azure_config: Azure configuration, used to store various values for the job to be submitted
    :param source_config: source configuration, for other needed values
    :param estimator_inputs: value for the "inputs" field of the estimator.
    :return: The PyTorch estimator. If `azure_config.hyperdrive` is set, the result of applying
    `source_config.hyperdrive_config_func` to the estimator is returned instead.
    """
    # AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
    script_path = Path(source_config.entry_script)
    entry_script_relative_path = script_path.relative_to(source_config.root_folder).as_posix()
    logging.info(
        f"Entry script {entry_script_relative_path} ({source_config.entry_script} relative to "
        f"source directory {source_config.root_folder})")
    # Start from the fixed settings, then let the variables given in source_config take precedence.
    environment_variables = {
        "AZUREML_OUTPUT_UPLOAD_TIMEOUT_SEC": str(source_config.upload_timeout_seconds),
        "MKL_SERVICE_FORCE_INTEL": "1",
    }
    environment_variables.update(source_config.environment_variables or {})
    # Merge the project-specific dependencies with the packages that InnerEye itself needs. This should not be
    # necessary if the innereye package is installed. It is necessary when working with an outer project and
    # InnerEye as a git submodule and submitting jobs from the local machine.
    # In case of version conflicts, the package version in the outer project is given priority.
    conda_dependencies = merge_conda_dependencies(
        source_config.conda_dependencies_files)  # type: ignore
    if azure_config.pip_extra_index_url:
        # When an extra-index-url is supplied, swap the order in which packages are searched for.
        # This is necessary if we need to consume packages from extra-index that clash with names of packages on
        # pypi
        for pip_option in (f"--index-url {azure_config.pip_extra_index_url}",
                           "--extra-index-url https://pypi.org/simple"):
            conda_dependencies.set_pip_option(pip_option)
    # create Estimator environment
    framework_version = pytorch_version_from_conda_dependencies(
        conda_dependencies)
    logging.info(f"PyTorch framework version: {framework_version}")
    max_run_duration = (run_duration_string_to_seconds(azure_config.max_run_duration)
                        if azure_config.max_run_duration else None)
    workspace = azure_config.get_workspace()
    # Use blob storage for storing the source, rather than the FileShares section of the storage account.
    source_data_store = workspace.datastores.get(WORKSPACE_DEFAULT_BLOB_STORE_NAME)
    estimator = PyTorch(
        source_directory=source_config.root_folder,
        entry_script=entry_script_relative_path,
        script_params=source_config.script_params,
        compute_target=azure_config.cluster,
        source_directory_data_store=source_data_store,
        inputs=estimator_inputs,
        environment_variables=environment_variables,
        shm_size=azure_config.docker_shm_size,
        use_docker=True,
        use_gpu=True,
        framework_version=framework_version,
        max_run_duration_seconds=max_run_duration)
    estimator.run_config.environment.python.conda_dependencies = conda_dependencies
    # We'd like to log the estimator config, but conversion to string fails when the Estimator has some inputs.
    # logging.info(azure_util.estimator_to_string(estimator))
    if not azure_config.hyperdrive:
        return estimator
    return source_config.hyperdrive_config_func(estimator)  # type: ignore
def submit_for_inference(args: SubmitForInferenceConfig,
                         azure_config: AzureConfig) -> Optional[Path]:
    """
    Create and submit an inference to AzureML, and optionally download the resulting segmentation.
    The submission is assembled in a temporary folder that contains the image to score, the Conda environment
    files downloaded from the model, and a copy of the scoring entry script. The function only waits for the run
    to complete if a download folder is given.
    :param args: configuration, see SubmitForInferenceConfig
    :param azure_config: An object with all necessary information for accessing Azure.
    :return: path to downloaded segmentation on local disc, or None if `args.download_folder` is None.
    :raises ValueError: If no Conda environment definition file could be downloaded from the model.
    """
    logging.info(f"Building Azure configuration from {args.settings}")
    logging.info("Getting workspace")
    workspace = azure_config.get_workspace()
    logging.info("Identifying model")
    model = Model(workspace=workspace, id=args.model_id)
    model_id = model.id
    logging.info(f"Identified model {model_id}")
    # Use mkdtemp rather than a TemporaryDirectory object: a TemporaryDirectory removes its folder as soon as the
    # object is destroyed, which would happen when this function returns, and hence args.keep_upload_folder could
    # never be honoured. With mkdtemp, deletion is fully under our control.
    source_directory_path = Path(tempfile.mkdtemp())
    try:
        logging.info(
            f"Building inference run submission in {source_directory_path}")
        image_folder = source_directory_path / DEFAULT_DATA_FOLDER
        image = copy_image_file(args.image_file, image_folder, args.use_dicom)
        model_sas_urls = model.get_sas_urls()
        # Identifies all the files with basename "environment.yml" in the model and downloads them.
        # These downloads should go into a temp folder that will most likely not be included in the model itself,
        # because the AzureML run will later download the model into the same folder structure, and the file names
        # might clash.
        temp_folder = source_directory_path / "temp_for_scoring"
        conda_files = download_files_from_model(model_sas_urls,
                                                ENVIRONMENT_YAML_FILE_NAME,
                                                dir_path=temp_folder)
        if not conda_files:
            raise ValueError(
                "At least 1 Conda environment definition must exist in the model.")
        # Retrieve the name of the Python environment that the training run used. This environment should have been
        # registered. If no such environment exists, it will be re-created from the Conda files provided.
        python_environment_name = model.tags.get(PYTHON_ENVIRONMENT_NAME, "")
        # Copy the scoring script from the repository. This will start the model download from Azure, and invoke the
        # scoring script.
        entry_script = source_directory_path / Path(RUN_SCORING_SCRIPT).name
        shutil.copyfile(str(repository_root_directory(RUN_SCORING_SCRIPT)),
                        str(entry_script))
        source_config = SourceConfig(
            root_folder=source_directory_path,
            entry_script=entry_script,
            script_params=[
                "--model-folder",
                ".",
                "--model-id",
                model_id,
                SCORE_SCRIPT,
                # The data folder must be relative to the root folder of the AzureML job. test_image_files
                # is then just the file relative to the data_folder
                "--data_folder",
                image.parent.name,
                "--image_files",
                image.name,
                "--use_dicom",
                str(args.use_dicom)
            ],
            conda_dependencies_files=conda_files,
        )
        run_config = create_run_config(azure_config,
                                       source_config,
                                       environment_name=python_environment_name)
        exp = Experiment(workspace=workspace, name=args.experiment_name)
        run = exp.submit(run_config)
        logging.info(f"Submitted run {run.id} in experiment {run.experiment.name}")
        logging.info(f"Run URL: {run.get_portal_url()}")
    finally:
        # Remove the submission folder on success and on failure alike, unless the caller asked to keep it.
        if not args.keep_upload_folder:
            shutil.rmtree(str(source_directory_path))
            logging.info(f"Deleted submission directory {source_directory_path}")
    if args.download_folder is None:
        return None
    logging.info("Awaiting run completion")
    run.wait_for_completion()
    logging.info(f"Run has completed with status {run.get_status()}")
    download_path = choose_download_path(args.download_folder)
    logging.info(f"Attempting to download segmentation to {download_path}")
    run.download_file(DEFAULT_RESULT_IMAGE_NAME, str(download_path))
    if download_path.exists():
        logging.info(f"Downloaded segmentation to {download_path}")
    else:
        logging.warning("Segmentation NOT downloaded")
    return download_path
Пример #19
0
def create_and_submit_experiment(azure_config: AzureConfig,
                                 script_run_config: ScriptRunConfig,
                                 commandline_args: str) -> Run:
    """
    Creates an AzureML experiment in the workspace and submits it for execution. After submission, the run is
    tagged, its recovery ID is written to a file, and information about the run is printed to the console.
    :param azure_config: azure related configurations to setup a valid workspace.
    :param script_run_config: The configuration for the script that should be run inside of AzureML.
    :param commandline_args: A string with all commandline arguments that were provided to the runner. These are
    passed on to `set_run_tags` for tagging the submitted AzureML run.
    :returns: Run object for the submitted AzureML run
    """
    workspace = azure_config.get_workspace()
    friendly_name = azure_util.to_azure_friendly_string(create_experiment_name(azure_config))
    exp = Experiment(workspace=workspace, name=friendly_name)

    # submit a training/testing run associated with the experiment
    run: Run = exp.submit(script_run_config)

    if is_offline_run_context(run):
        # This codepath will only be executed in unit tests, when exp.submit is mocked.
        return run

    # Set metadata for the run.
    set_run_tags(run, azure_config, commandline_args=commandline_args)

    print(
        "\n=============================================================================="
    )
    print(f"Successfully queued new run {run.id} in experiment: {exp.name}")

    if azure_config.run_recovery_id:
        print(f"\nRecovered from: {azure_config.run_recovery_id}")

    # Persist the recovery ID, replacing any file left over from an earlier submission.
    recovery_id = azure_util.create_run_recovery_id(run)
    recovery_file = Path(RUN_RECOVERY_FILE)
    if recovery_file.exists():
        recovery_file.unlink()
    recovery_file.write_text(recovery_id)

    experiment_url = exp.get_portal_url()
    print("Experiment URL: {}".format(experiment_url))
    run_url = run.get_portal_url()
    print("Run URL: {}".format(run_url))
    print(
        "If this run fails, re-start runner.py and supply these additional arguments: "
        f"--run_recovery_id={recovery_id}")
    print(
        f"The run recovery ID has been written to this file: {recovery_file}")
    print(
        "=============================================================================="
    )
    if not (azure_config.tensorboard and azure_config.azureml):
        # No live monitoring requested: only tell the user how to start TensorBoard later.
        print(
            f"To monitor this run locally using TensorBoard, run the script: "
            f"InnerEye/Azure/tensorboard_monitor.py --run_ids={run.id}")
        print(
            "=============================================================================="
        )
        return run

    print("Starting TensorBoard now because you specified --tensorboard")
    monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[run.id]),
            azure_config=azure_config)
    return run