Example #1
def create_run_config(azure_config: AzureConfig,
                      source_config: SourceConfig,
                      all_azure_dataset_ids: List[str],
                      all_dataset_mountpoints: List[str],
                      environment_name: str = "") -> ScriptRunConfig:
    """
    Creates a configuration to run the InnerEye training script in AzureML.
    :param azure_config: Azure-related configurations to use for model scale-out behaviour
    :param source_config: configurations for model execution, such as name and execution mode
    :param all_azure_dataset_ids: The names of all datasets on blob storage that will be used for this run.
    :param all_dataset_mountpoints: When using the datasets in AzureML, these are the per-dataset mount points.
    :param environment_name: If specified, try to retrieve the existing Python environment with this name. If that
    is not found, create one from the Conda files provided in `source_config`. This parameter is meant to be used
    when running inference for an existing model.
    :return: The configured script run.
    """
    dataset_consumptions = create_dataset_consumptions(
        azure_config, all_azure_dataset_ids, all_dataset_mountpoints)
    # AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
    entry_script_relative_path = source_config.entry_script.relative_to(
        source_config.root_folder).as_posix()
    logging.info(
        f"Entry script {entry_script_relative_path} ({source_config.entry_script} relative to "
        f"source directory {source_config.root_folder})")
    max_run_duration = None
    if azure_config.max_run_duration:
        max_run_duration = run_duration_string_to_seconds(
            azure_config.max_run_duration)
    workspace = azure_config.get_workspace()
    run_config = RunConfiguration(
        script=entry_script_relative_path,
        arguments=source_config.script_params,
    )
    run_config.environment = get_or_create_python_environment(
        azure_config, source_config, environment_name=environment_name)
    run_config.target = azure_config.cluster
    run_config.max_run_duration_seconds = max_run_duration
    if azure_config.num_nodes > 1:
        distributed_job_config = MpiConfiguration(
            node_count=azure_config.num_nodes)
        run_config.mpi = distributed_job_config
        run_config.framework = "Python"
        run_config.communicator = "IntelMpi"
        run_config.node_count = distributed_job_config.node_count
    if len(dataset_consumptions) > 0:
        run_config.data = {
            dataset.name: dataset
            for dataset in dataset_consumptions
        }
    # Use blob storage for storing the source, rather than the FileShares section of the storage account.
    run_config.source_directory_data_store = workspace.datastores.get(
        WORKSPACE_DEFAULT_BLOB_STORE_NAME).name
    script_run_config = ScriptRunConfig(
        source_directory=str(source_config.root_folder),
        run_config=run_config,
    )
    if azure_config.hyperdrive:
        script_run_config = source_config.hyperdrive_config_func(
            script_run_config)  # type: ignore
    return script_run_config
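A minimal usage sketch for create_run_config, assuming azure_config and source_config have already been constructed; the dataset names, mount points and experiment name below are placeholders:

# Hedged usage sketch; dataset and experiment names are placeholders.
from azureml.core import Experiment

script_run_config = create_run_config(
    azure_config=azure_config,
    source_config=source_config,
    all_azure_dataset_ids=["my_training_dataset"],
    all_dataset_mountpoints=["datasets/training"],
)
experiment = Experiment(workspace=azure_config.get_workspace(), name="innereye-training")
run = experiment.submit(script_run_config)
run.wait_for_completion(show_output=True)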
Example #2
    async def __create_cluster(self):
        self.__print_message("Setting up cluster")
        exp = Experiment(self.workspace, self.experiment_name)
        estimator = Estimator(
            os.path.join(self.abs_path, "setup"),
            compute_target=self.compute_target,
            entry_script="start_jupyter.py",
            environment_definition=self.environment_definition,
            script_params=self.scheduler_params,
            node_count=1,  ### start only scheduler
            distributed_training=MpiConfiguration(),
            use_docker=True,
            inputs=self.datastores,
        )
        run = exp.submit(estimator, tags=self.tags)

        self.__print_message("Waiting for compute cluster's IP")
        while (
            run.get_status() != "Canceled"
            and run.get_status() != "Failed"
            and "jupyter" not in run.get_metrics()  #and "scheduler" not in run.get_metrics()
        ):
            print(".", end="")
            logger.info("Compute Cluster not ready")
            time.sleep(5)

        if run.get_status() == "Canceled" or run.get_status() == "Failed":
            logger.exception("Failed to start the AzureML Compute Cluster")
            raise Exception("Failed to start the AzureML Compute Cluster.")

        print("\n\n")

        ### SET FLAGS
        # The start_jupyter.py entry script reports the scheduler address under the "jupyter" metric.
        self.scheduler_ip_port = run.get_metrics()["jupyter"]
        self.worker_params["--scheduler_ip_port"] = self.scheduler_ip_port
        self.__print_message(f"Scheduler: {self.scheduler_ip_port}")
        self.run = run

        logger.info(f"Scheduler: {self.scheduler_ip_port}")

        ### CHECK IF ON THE SAME VNET
        while self.same_vnet is None:
            self.__check_if_scheduler_ip_reachable()
            time.sleep(1)

        ### REQUIRED BY dask.distributed.deploy.cluster.Cluster
        # This variant skips the rpc() / super()._start() calls used in the other
        # __create_cluster implementations below; it only prepares the head-node
        # connection, sets up port forwarding and refreshes the links.
        _scheduler = self.__prepare_rpc_connection_to_headnode()
        self.__setup_port_forwarding()
        self.__update_links()

        self.__print_message("Connections established")
Example #3
def create_run_config(azure_config: AzureConfig,
                      source_config: SourceConfig,
                      azure_dataset_id: str = "",
                      environment_name: str = "") -> ScriptRunConfig:
    """
    Creates a configuration to run the InnerEye training script in AzureML.
    :param azure_config: Azure-related configurations to use for model scale-out behaviour
    :param source_config: configurations for model execution, such as name and execution mode
    :param azure_dataset_id: The name of the dataset in blob storage to be used for this run. This can be an empty
    string to not use any datasets.
    :param environment_name: If specified, try to retrieve the existing Python environment with this name. If that
    is not found, create one from the Conda files provided in `source_config`. This parameter is meant to be used
    when running inference for an existing model.
    :return: The configured script run.
    """
    if azure_dataset_id:
        azureml_dataset = get_or_create_dataset(azure_config, azure_dataset_id=azure_dataset_id)
        if not azureml_dataset:
            raise ValueError(f"AzureML dataset {azure_dataset_id} could not be found or created.")
        named_input = azureml_dataset.as_named_input(INPUT_DATA_KEY)
        dataset_consumption = named_input.as_mount() if azure_config.use_dataset_mount else named_input.as_download()
    else:
        dataset_consumption = None
    # AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
    entry_script_relative_path = source_config.entry_script.relative_to(source_config.root_folder).as_posix()
    logging.info(f"Entry script {entry_script_relative_path} ({source_config.entry_script} relative to "
                 f"source directory {source_config.root_folder})")
    max_run_duration = None
    if azure_config.max_run_duration:
        max_run_duration = run_duration_string_to_seconds(azure_config.max_run_duration)
    workspace = azure_config.get_workspace()
    run_config = RunConfiguration(
        script=entry_script_relative_path,
        arguments=source_config.script_params,
    )
    run_config.environment = get_or_create_python_environment(azure_config, source_config,
                                                              environment_name=environment_name)
    run_config.target = azure_config.cluster
    run_config.max_run_duration_seconds = max_run_duration
    if azure_config.num_nodes > 1:
        distributed_job_config = MpiConfiguration(node_count=azure_config.num_nodes)
        run_config.mpi = distributed_job_config
        run_config.framework = "Python"
        run_config.communicator = "IntelMpi"
        run_config.node_count = distributed_job_config.node_count
    if dataset_consumption:
        run_config.data = {dataset_consumption.name: dataset_consumption}
    # Use blob storage for storing the source, rather than the FileShares section of the storage account.
    run_config.source_directory_data_store = workspace.datastores.get(WORKSPACE_DEFAULT_BLOB_STORE_NAME).name
    script_run_config = ScriptRunConfig(
        source_directory=str(source_config.root_folder),
        run_config=run_config,
    )
    if azure_config.hyperdrive:
        script_run_config = source_config.hyperdrive_config_func(script_run_config)  # type: ignore
    return script_run_config
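As in Example #1, the returned configuration is submitted through an Experiment; a short sketch for this single-dataset variant, with placeholder names:

# Hedged usage sketch; the dataset and experiment names are placeholders.
from azureml.core import Experiment

script_run_config = create_run_config(
    azure_config=azure_config,
    source_config=source_config,
    azure_dataset_id="my_training_dataset",
)
run = Experiment(azure_config.get_workspace(), "innereye-training").submit(script_run_config)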
Example #4
# Create image registry configuration
if experiment_settings["docker"]["custom_image"]:
    container_registry = ContainerRegistry()
    container_registry.address = experiment_settings["docker"][
        "custom_image_registry_details"]["address"]
    container_registry.username = experiment_settings["docker"][
        "custom_image_registry_details"]["username"]
    container_registry.password = experiment_settings["docker"][
        "custom_image_registry_details"]["password"]
else:
    container_registry = None

# Create distributed training configuration
if experiment_settings["distributed_training"]["backend_config"] == "mpi":
    distrib_training_backend = MpiConfiguration()
    distrib_training_backend.process_count_per_node = experiment_settings[
        "distributed_training"]["mpi"]["process_count_per_node"]
elif experiment_settings["distributed_training"][
        "backend_config"] == "parameter_server":
    distrib_training_backend = TensorflowConfiguration()
    distrib_training_backend.worker_count = experiment_settings[
        "distributed_training"]["parameter_server"]["worker_count"]
    distrib_training_backend.parameter_server_count = experiment_settings[
        "distributed_training"]["parameter_server"]["parameter_server_count"]
elif experiment_settings["distributed_training"]["backend_config"] == "gloo":
    distrib_training_backend = Gloo()
elif experiment_settings["distributed_training"]["backend_config"] == "nccl":
    distrib_training_backend = Nccl()
else:
    distrib_training_backend = None
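The registry and backend objects built above are typically handed to an Estimator, as in Example #13 further down; a hedged sketch, with the source directory, entry script and compute target as placeholders:

# Sketch only: paths, script name and compute target are placeholders.
from azureml.train.estimator import Estimator

estimator = Estimator(
    source_directory="./train",
    entry_script="train.py",
    compute_target=compute_target,                   # assumed to be resolved elsewhere in the script
    distributed_training=distrib_training_backend,   # may be None for single-process runs
    image_registry_details=container_registry,       # may be None when no custom image is used
    use_docker=True,
)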
Example #5
def main():

    # Collect command line arguments
    args = parse_command_line_args()

    # Collect runclass and default (hot) dataset name
    dataset = sharedconfig.dataset_hot

    # Replace/update args for using premium storage
    if args.premium:
        dataset = sharedconfig.dataset_premium

    # Replace/update args for using cool storage
    if args.cool:
        dataset = sharedconfig.dataset_cool

    workspace = get_or_create_workspace(
        sharedconfig.subscription,
        sharedconfig.resource_group,
        sharedconfig.workspace_name,
    )

    # Get and update the ClusterConnector object
    # NOTE: This is *NOT* an azureml.core.compute.AmlCompute object but a wrapper
    # See clusterconnector.py for more details
    clusterconnector = create_or_update_cluster(
        workspace,
        sharedconfig.cluster_name,
        args.num_nodes,
        sharedconfig.ssh_key,
        sharedconfig.vm_type,
        terminate_on_failure=True,
        use_beeond=False,
    )

    # Get and update the AzureML Environment object
    environment = create_or_update_environment(workspace,
                                               sharedconfig.environment_name,
                                               sharedconfig.docker_image)

    # Get/Create an experiment object
    experiment = Experiment(workspace=workspace,
                            name=sharedconfig.experiment_name)

    # Configure the distributed compute settings
    pytorchconfig = MpiConfiguration(
        node_count=args.num_nodes,
        process_count_per_node=sharedconfig.gpus_per_node)

    # Collect arguments to be passed to training script
    script_args = ["--dataset", dataset]
    script_args.extend(
        generate_training_opts(
            args.num_nodes * sharedconfig.gpus_per_node,
            sharedconfig.ims_per_gpu,
            args.iter,
        ))
    script_args.extend(["PATHS_CATALOG", "./dataset_catalog.py"])

    # Define the configuration for running the training script
    script_conf = ScriptRunConfig(
        source_directory="train",
        script="train_net_download.py",
        compute_target=clusterconnector.cluster,
        environment=environment,
        arguments=script_args,
        distributed_job_config=pytorchconfig,
    )

    # We can use these tags to make a note of run parameters (avoids grepping the logs)
    runtags = {
        "class": k_runclass,
        "vmtype": sharedconfig.vm_type,
        "num_nodes": args.num_nodes,
        "ims_per_gpu": sharedconfig.ims_per_gpu,
        "iter": args.iter,
    }

    # Submit the run
    run = experiment.submit(config=script_conf, tags=runtags)

    # Can optionally choose to follow the output on the command line
    if args.follow:
        run.wait_for_completion(show_output=True)
Example #6
def get_pipeline(aml_compute: ComputeTarget, blob_ds: Datastore,
                 batch_env: Environment, tf_env: Environment) -> list:
    """
    Creates the pipeline steps.
    Parameters:
        aml_compute (ComputeTarget): a reference to a compute target
        blob_ds (Datastore): a reference to a datastore
        batch_env (Environment): a reference to an environment object
        tf_env (Environment): a Horovod/TensorFlow environment
    Returns:
        list: the pipeline steps
    """

    # Pipeline data produced by the first step and consumed by the later steps
    pipeline_files = PipelineData("pipeline_files",
                                  datastore=blob_ds).as_dataset()

    # Pipeline parameters to use with every run
    is_debug = PipelineParameter("is_debug", default_value=False)
    relay_connection_name = PipelineParameter("debug_relay_connection_name",
                                              default_value="none")

    single_step_config = RunConfiguration()
    single_step_config.environment = batch_env
    single_step = PythonScriptStep(
        name=f"single-step",
        script_name="samples/azure_ml_advanced/steps/single_step.py",
        source_directory=".",
        runconfig=single_step_config,
        arguments=[
            "--pipeline-files", pipeline_files, "--is-debug", is_debug,
            "--debug-relay-connection-name", relay_connection_name,
            "--debug-port", 5678, "--debug-relay-connection-string-secret",
            debug_connection_string_secret_name
        ],
        inputs=[],
        outputs=[pipeline_files],
        compute_target=aml_compute,
        allow_reuse=False)

    output_dir = PipelineData("output_dir")

    parallel_run_config = ParallelRunConfig(
        entry_script="samples/azure_ml_advanced/steps/parallel_step.py",
        source_directory=".",
        mini_batch_size="5",
        output_action="summary_only",
        environment=batch_env,
        compute_target=aml_compute,
        error_threshold=10,
        run_invocation_timeout=600,  # very important for debugging
        node_count=2,
        process_count_per_node=1)

    parallelrun_step = ParallelRunStep(
        name="parallel-run-step",
        parallel_run_config=parallel_run_config,
        inputs=[pipeline_files],
        output=output_dir,
        arguments=[
            "--is-debug", is_debug, "--debug-relay-connection-name",
            relay_connection_name, "--debug-port", 5679,
            "--debug-relay-connection-string-secret",
            debug_connection_string_secret_name
        ],
        allow_reuse=False)

    parallelrun_step.run_after(single_step)

    distr_config = MpiConfiguration(process_count_per_node=1, node_count=2)

    src = ScriptRunConfig(
        source_directory=".",
        script="samples/azure_ml_advanced/steps/mpi/mpi_step_starter.py",
        arguments=[
            "--input-ds", pipeline_files, "--is-debug", is_debug,
            "--debug-relay-connection-name", relay_connection_name,
            "--debug-port", 5680, "--debug-relay-connection-string-secret",
            debug_connection_string_secret_name
        ],
        compute_target=compute_name,
        environment=tf_env,
        distributed_job_config=distr_config,
    )

    mpi_step = PythonScriptStep(
        name="mpi-step",
        script_name="samples/azure_ml_advanced/steps/mpi/mpi_step_starter.py",
        arguments=[
            "--input-ds", pipeline_files, "--is-debug", is_debug,
            "--debug-relay-connection-name", relay_connection_name,
            "--debug-port", 5680, "--debug-relay-connection-string-secret",
            debug_connection_string_secret_name
        ],
        compute_target=aml_compute,
        inputs=[pipeline_files],
        outputs=[],
        runconfig=src.run_config,
        source_directory=".")

    mpi_step.run_after(parallelrun_step)

    print("Pipeline Steps Created")

    steps = [single_step, parallelrun_step, mpi_step]

    print(f"Returning {len(steps)} steps")
    return steps
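A hedged sketch of how the returned steps might be assembled into a pipeline and submitted, assuming ws, the compute target, datastore and environments are available in the calling script:

# Sketch only: the experiment name is a placeholder.
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

steps = get_pipeline(aml_compute, blob_ds, batch_env, tf_env)
pipeline = Pipeline(workspace=ws, steps=steps)
pipeline_run = Experiment(ws, "debug-relay-pipeline").submit(pipeline)
pipeline_run.wait_for_completion(show_output=True)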
Example #7
        compute_target = ComputeTarget(workspace=ws, name=compute_name)
    except ComputeTargetException:
        compute_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6', min_nodes=0, max_nodes=6)
        compute_target = ComputeTarget.create(ws, compute_name, compute_config)
        compute_target.wait_for_completion(show_output=True)

    run_conf.target = compute_target
    run_conf.environment.docker.enabled = True
    run_conf.environment.docker.base_image = DEFAULT_CPU_IMAGE
    run_conf.environment.python.conda_dependencies = \
        CondaDependencies(conda_dependencies_file_path='env.yml')
    run_conf.environment.python.user_managed_dependencies = False
    if cv:
        run_conf.communicator = 'OpenMPI'
        run_conf.mpi = MpiConfiguration()
        run_conf.node_count = cv + 2
exp = Experiment(workspace=ws, name=config['experiment_name'])

use_estimator = True
if use_estimator:
    if cv:
        script_params = {'--cv': cv}
        node_count = cv + 2  # dask-mpi uses 2 nodes for its scheduler and client
        distributed_training = MpiConfiguration()
    else:
        script_params = None
        node_count = None
        distributed_training = None
    to_run = Estimator(source_directory='.',
                       compute_target=compute_target,
Example #8
# azure ml settings
experiment_name = "tensorflow-mnist-distributed"
compute_name = "gpu-8x-a100"

# environment
env = Environment.get(
    workspace=ws,
    name="AzureML-tensorflow-2.4-ubuntu18.04-py37-cuda11-gpu").clone(
        "tensorflow-2.4-gpu")

# Experiment configuration
node_count = 2  # number of nodes
process_count_per_node = 8  # number of GPUs per node

# create distributed config
distr_config = MpiConfiguration(process_count_per_node=process_count_per_node,
                                node_count=node_count)

# create arguments
args = ["--epochs", 5]

# create job config
src = ScriptRunConfig(
    source_directory=source_dir,
    script=script_name,
    arguments=args,
    compute_target=compute_name,
    environment=env,
    distributed_job_config=distr_config,
)

# submit job
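The snippet stops at the submission comment; a likely completion, mirroring Example #17 below and assuming ws and experiment_name are defined earlier in the script:

# Hedged completion of the truncated snippet.
from azureml.core import Experiment

run = Experiment(ws, experiment_name).submit(src)
run.wait_for_completion(show_output=True)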
Example #9
    # define script parameters
    script_params_3 = {
        '--models': models,
        '--data_folder_train':
        dataset_train.as_named_input('train').as_mount(),
        '--data_folder_test': dataset_test.as_named_input('test').as_mount(),
        '--local': 'no'
    }

    estimator = PyTorch(
        entry_script='train.py',
        script_params=script_params_3,
        source_directory=os.path.dirname(os.path.realpath(__file__)),
        compute_target=workspace.compute_targets["alwaysoncluster"],
        distributed_training=MpiConfiguration(),
        framework_version='1.4',
        use_gpu=True,
        pip_packages=[
            'numpy==1.15.4', 'pandas==0.23.4', 'scikit-learn==0.20.1',
            'scipy==1.0.0', 'matplotlib==3.0.2', 'utils==0.9.0',
            'onnxruntime==1.2.0', 'onnx==1.6.0'
        ])

    experiment = Experiment(workspace=workspace, name="deeplearning")
    run = experiment.submit(estimator)

    if hyperdrive is True:
        # Define multi-run configuration
        hyperdrive_run_config = HyperDriveConfig(
            estimator=estimator,
Example #10
    async def __create_cluster(self):
        self.__print_message("Setting up cluster")
        run = None
        if self.parent_run:
            ## scheduler run as child run
            run_config = RunConfiguration()
            run_config.environment = self.environment_definition
            run_config.target = self.compute_target
            args = []
            for key, value in self.scheduler_params.items():
                args.append(f"{key}={value}")

            file_dataset_registered_name = self.kwargs.get(
                'file_dataset_registered_name', None)
            dataset_config_name = self.kwargs.get('dataset_config_name', None)
            path_on_compute = self.kwargs.get('path_on_compute', None)
            if path_on_compute is not None:
                dataset = Dataset.get_by_name(
                    workspace=self.workspace,
                    name=file_dataset_registered_name)
                input1 = dataset.as_named_input(dataset_config_name).as_mount(
                    path_on_compute=path_on_compute)
                args.append(input1)

            child_run_config = ScriptRunConfig(
                source_directory=os.path.join(self.abs_path, "setup"),
                script="start_scheduler.py",
                arguments=args,
                run_config=run_config,
            )
            run = self.parent_run.submit_child(child_run_config,
                                               tags=self.tags)
        else:
            # submit scheduler run
            exp = Experiment(self.workspace, self.experiment_name)
            estimator = Estimator(
                os.path.join(self.abs_path, "setup"),
                compute_target=self.compute_target,
                entry_script="start_scheduler.py",
                environment_definition=self.environment_definition,
                script_params=self.scheduler_params,
                node_count=1,  ### start only scheduler
                distributed_training=MpiConfiguration(),
                use_docker=True,
                inputs=self.datastores,
            )

            run = exp.submit(estimator, tags=self.tags)

        self.__print_message("Waiting for scheduler node's IP")

        while (run.get_status() != "Canceled" and run.get_status() != "Failed"
               and "scheduler" not in run.get_metrics()):
            print(".", end="")
            logger.info("Scheduler not ready")
            time.sleep(5)

        if run.get_status() == "Canceled" or run.get_status() == "Failed":
            logger.exception("Failed to start the AzureML cluster")
            raise Exception("Failed to start the AzureML cluster.")

        self.run = run
        print("\n\n")

        ### SET FLAGS
        self.scheduler_ip_port = run.get_metrics()["scheduler"]
        self.worker_params["--scheduler_ip_port"] = self.scheduler_ip_port
        self.__print_message(f'Scheduler: {run.get_metrics()["scheduler"]}')

        logger.info(f'Scheduler: {run.get_metrics()["scheduler"]}')

        ### CHECK IF ON THE SAME VNET
        print("check if on the same vnet")
        while self.same_vnet is None:
            await self.sync(self.__check_if_scheduler_ip_reachable)
            time.sleep(1)

        ### REQUIRED BY dask.distributed.deploy.cluster.Cluster
        _scheduler = self.__prepare_rpc_connection_to_headnode()
        self.scheduler_comm = rpc(_scheduler)
        await self.sync(self.__setup_port_forwarding)
        await self.sync(super()._start)
        await self.sync(self.__update_links)

        self.__print_message("Connections established")
        self.__print_message(f"Scaling to {self.initial_node_count} workers")

        # LOGIC TO KEEP PROPER TRACK OF WORKERS IN `scale`
        if self.initial_node_count > 1:
            self.scale(self.initial_node_count)
        self.__print_message(f"Scaling is done")
Example #11
def main():

    parser = argparse.ArgumentParser(
        description="Run Elbencho on a BeeOND enabled cluster"
    )

    parser.add_argument("num_nodes", type=int, help="Number of nodes")
    parser.add_argument("--follow", action="store_true", help="Follow run output")
    parser.add_argument(
        "--keep-cluster",
        action="store_true",
        help="Don't autoscale cluster down when idle (after run completed)",
    )
    parser.add_argument(
        "--keep-failed-cluster", dest="terminate_on_failure", action="store_false"
    )

    parser.add_argument("--sharedfiles", action="store_false", dest="multifile")

    args = parser.parse_args()

    workspace = get_or_create_workspace(
        sharedconfig.subscription_id,
        sharedconfig.resource_group_name,
        sharedconfig.workspace_name,
        sharedconfig.location,
    )

    try:
        clusterconnector = create_or_update_cluster(
            workspace,
            sharedconfig.cluster_name,
            args.num_nodes,
            sharedconfig.ssh_key,
            sharedconfig.vm_type,
            terminate_on_failure=args.terminate_on_failure,
            use_beeond=True,
        )
    except RuntimeError:
        cprint("Fatal Error - exiting", "red", attrs=["bold"])
        sys.exit(-1)

    docker_args = [
        "-v",
        "{}:{}".format(clusterconnector.beeond_mnt, sharedconfig.beeond_map),
    ]

    # Get and update the AzureML Environment object
    environment = create_or_update_environment(
        workspace, sharedconfig.environment_name, sharedconfig.docker_image, docker_args
    )

    # Get/Create an experiment object
    experiment = Experiment(workspace=workspace, name=sharedconfig.experiment_name)

    # Configure the distributed compute settings
    parallelconfig = MpiConfiguration(
        node_count=args.num_nodes, process_count_per_node=1
    )

    if args.multifile:
        runscript = "./run_elbencho_multifile.sh"
    else:
        runscript = "./run_elbencho_largefile.sh"

    # Collect arguments to be passed to elbencho script
    script_args = [
        "bash",
        runscript,
        sharedconfig.beeond_map,
        str(args.num_nodes),
        *clusterconnector.ibaddrs,
    ]

    # Define the configuration for running the training script
    script_conf = ScriptRunConfig(
        source_directory="scripts",
        command=script_args,
        compute_target=clusterconnector.cluster,
        environment=environment,
        distributed_job_config=parallelconfig,
    )

    # We can use these tags to make a note of run parameters (avoids grepping the logs)
    runtags = {
        "class": k_runclass,
        "vmtype": sharedconfig.vm_type,
        "num_nodes": args.num_nodes,
        "run_type": "multifile" if args.multifile else "sharedfile",
    }

    # Submit the run
    run = experiment.submit(config=script_conf, tags=runtags)

    # Can optionally choose to follow the output on the command line
    if args.follow:
        run.wait_for_completion(show_output=True)
Example #12
    # show up if you view the cl help
    args, args_to_pass_on = parser.parse_known_args()

    ws = Workspace.from_config()
    experiment = Experiment(workspace=ws, name=args.name)

    # Since for the training script directories are relative to the training script
    params_path = os.path.join("src", args.params_path)
    with open(params_path) as f:
        params = yaml.safe_load(f)

    params.update(vars(args))

    node_count = params["num_gpus"] if params["use_hvd"] else 1

    distr_config = MpiConfiguration(node_count=node_count)

    config = ScriptRunConfig(
        source_directory="./src",
        script="model/train.py",
        compute_target=args.compute_target,
        distributed_job_config=distr_config,
        arguments=["--params-path", args.params_path] + args_to_pass_on,
    )

    config.run_config.environment = load_azml_env()

    run = experiment.submit(config)
    if args.run_label is not None:
        run.display_name = args.run_label
Example #13
    registry_address = acr.group(1)  # onnxtraining.azurecr.io
    registry_name = acr.group(2)  # onnxtraining
    container_image = acr.group(3)  # azureml/bert:latest

    registry_client = get_client_from_cli_profile(
        ContainerRegistryManagementClient, subscription_id=args.subscription)
    registry_credentials = registry_client.registries.list_credentials(
        args.container_registry_resource_group, registry_name)

    registry_details = ContainerRegistry()
    registry_details.address = registry_address
    registry_details.username = registry_credentials.username
    registry_details.password = registry_credentials.passwords[0].value

# MPI configuration if executing a distributed run
mpi = MpiConfiguration()
mpi.process_count_per_node = args.gpu_count

# AzureML Estimator that describes how to run the Experiment
estimator = Estimator(source_directory='./',
                      script_params=script_params,
                      compute_target=compute_target,
                      node_count=args.node_count,
                      distributed_training=mpi,
                      image_registry_details=registry_details,
                      use_docker=True,
                      custom_docker_image=container_image,
                      entry_script='train.py',
                      inputs=[ds.path('./').as_mount()])

# Start the AzureML Experiment
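The example is cut off before the submission itself; a hedged completion, assuming a Workspace object ws is available and using a placeholder experiment name:

# Hedged completion; the experiment name is a placeholder.
from azureml.core import Experiment

experiment = Experiment(workspace=ws, name="onnx-bert-training")
run = experiment.submit(estimator)
run.wait_for_completion(show_output=True)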
Example #14
ws = Workspace("48bbc269-ce89-4f6f-9a12-c6f91fcb772d", "aml1p-rg",
               "aml1p-ml-wus2")

env = Environment.from_conda_specification("hydra-pl", "environment.yml")
env.docker.enabled = True
env.docker.base_image = (
    "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04")

# ==============================================================================
node_count = 2
gpus_per_node = -1
cluster = "gpu-nc24-lowpri"
# ==============================================================================

mpi_config = MpiConfiguration(process_count_per_node=1, node_count=node_count)

config = ScriptRunConfig(
    source_directory=".",
    script="train.py",
    compute_target=cluster,
    distributed_job_config=mpi_config,
    environment=env,
    arguments=[
        f"trainer.gpus={gpus_per_node}",
        f"trainer.num_nodes={node_count}",
        "+trainer.accelerator=ddp",
    ],
)

exp = Experiment(ws, "azuremlv2")
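The submission call itself is not shown; it presumably follows the usual pattern:

# Hedged completion of the truncated snippet.
run = exp.submit(config)
run.wait_for_completion(show_output=True)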
Example #15
    async def __create_cluster(self):
        self.__print_message("Setting up cluster")
        exp = Experiment(self.workspace, self.experiment_name)
        estimator = Estimator(
            os.path.join(self.abs_path, "setup"),
            compute_target=self.compute_target,
            entry_script="start_scheduler.py",
            environment_definition=self.environment_definition,
            script_params=self.scheduler_params,
            node_count=1,  ### start only scheduler
            distributed_training=MpiConfiguration(),
            use_docker=True,
            inputs=self.datastores,
        )

        run = exp.submit(estimator, tags=self.tags)

        self.__print_message("Waiting for scheduler node's IP")
        status = run.get_status()
        while (
            status != "Canceled"
            and status != "Failed"
            and "scheduler" not in run.get_metrics()
        ):
            print(".", end="")
            logger.info("Scheduler not ready")
            time.sleep(5)
            status = run.get_status()

        if status == "Canceled" or status == "Failed":
            run_error = run.get_details().get("error")
            error_message = "Failed to start the AzureML cluster."

            if run_error:
                error_message = "{} {}".format(error_message, run_error)
            logger.exception(error_message)

            if not self.compute_target_set:
                self.__delete_compute_target()

            raise Exception(error_message)

        print("\n")

        ### SET FLAGS
        self.scheduler_ip_port = run.get_metrics()["scheduler"]
        self.worker_params["--scheduler_ip_port"] = self.scheduler_ip_port
        self.__print_message(f'Scheduler: {run.get_metrics()["scheduler"]}')
        self.run = run

        ### CHECK IF ON THE SAME VNET
        max_retry = 5
        while self.same_vnet is None and max_retry > 0:
            time.sleep(5)
            await self.sync(self.__check_if_scheduler_ip_reachable)
            max_retry -= 1

        if self.same_vnet is None:
            self.run.cancel()
            if not self.compute_target_set:
                self.__delete_compute_target()
            logger.exception(
                "Connection error after retrying. Failed to start the AzureML cluster."
            )
            return

        ### REQUIRED BY dask.distributed.deploy.cluster.Cluster
        self.hostname = socket.gethostname()
        self.is_in_ci = (
            f"/mnt/batch/tasks/shared/LS_root/mounts/clusters/{self.hostname}"
            in os.getcwd()
        )
        _scheduler = self.__prepare_rpc_connection_to_headnode()
        self.scheduler_comm = rpc(_scheduler)
        await self.sync(self.__setup_port_forwarding)

        try:
            await super()._start()
        except Exception as e:
            logger.exception(e)
            # CLEAN UP COMPUTE TARGET
            self.run.cancel()
            if not self.compute_target_set:
                self.__delete_compute_target()
            return

        await self.sync(self.__update_links)

        self.__print_message("Connections established")
        self.__print_message(f"Scaling to {self.initial_node_count} workers")

        if self.initial_node_count > 1:
            self.scale(
                self.initial_node_count
            )  # LOGIC TO KEEP PROPER TRACK OF WORKERS IN `scale`
        self.__print_message("Scaling is done")
Example #16
def get_distributed_job_config(args: JobArguments):
    n_proc = TARGET_GPU_COUNT[args.target_name]
    distributed_job_config = MpiConfiguration(process_count_per_node=n_proc,
                                              node_count=args.node_count)
    return distributed_job_config
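A sketch of how this helper might be wired into a ScriptRunConfig; the JobArguments instance, source directory and script name are assumptions:

# Hedged usage sketch; args is assumed to be a JobArguments instance.
from azureml.core import ScriptRunConfig

src = ScriptRunConfig(
    source_directory="src",                      # placeholder
    script="train.py",                           # placeholder
    compute_target=args.target_name,
    distributed_job_config=get_distributed_job_config(args),
)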
Example #17
env.python.user_managed_dependencies = True
env.python.interpreter_path = "/opt/miniconda/bin/python"

# To install any Python packages you need, simply add RUN pip install package-name to the docker string. E.g. `RUN pip install sklearn`
# Specify docker steps as a string and use the base DeepSpeed Docker image
dockerfile = r"""
FROM deepspeed/base-aml:with-pt-ds-and-deps
RUN pip install azureml-mlflow
RUN echo "Welcome to the DeepSpeed custom environment!"
"""

# set base image to None, because the image is defined by dockerfile.
env.docker.base_image = None
env.docker.base_dockerfile = dockerfile

# create job config
mpi_config = MpiConfiguration(node_count=2, process_count_per_node=2)

src = ScriptRunConfig(
    source_directory=script_dir,
    script=script_name,
    arguments=arguments,
    environment=env,
    compute_target=compute_name,
    distributed_job_config=mpi_config,
)

# submit job
run = Experiment(ws, experiment_name).submit(src)
run.wait_for_completion(show_output=True)
Example #18
    parser.add_argument("-g", "--num-gpus", default=20)
    args = parser.parse_args()

    ws = Workspace.from_config()
    experiment = Experiment(workspace=ws, name=args.experiment)

    remote_model_file = (
        os.path.join("outputs", args.model_file)
        if not args.model_file.startswith("outputs")
        else args.model_file
    )
    model_file = f"data/azml/{args.run_id}_{args.model_file}.pt"
    if not os.path.exists(model_file):
        download_model_file(args.run_id, remote_model_file, model_file)

    distr_config = MpiConfiguration(node_count=args.num_gpus)
    config = ScriptRunConfig(
        source_directory="./src",
        script="model/predict_hvd.py",
        compute_target=args.compute_target,
        distributed_job_config=distr_config,
        arguments=[
            "--aoi",
            args.aoi_file,
            "--feature-file",
            args.feature_file,
            "--model-file",
            os.path.basename(model_file),
        ],
        max_run_duration_seconds=60 * 30,
        environment=load_azml_env(),