def create_run_config(azure_config: AzureConfig,
                      source_config: SourceConfig,
                      all_azure_dataset_ids: List[str],
                      all_dataset_mountpoints: List[str],
                      environment_name: str = "") -> ScriptRunConfig:
    """
    Creates a configuration to run the InnerEye training script in AzureML.
    :param azure_config: azure related configurations to use for model scale-out behaviour
    :param source_config: configurations for model execution, such as name and execution mode
    :param all_azure_dataset_ids: The names of all datasets on blob storage that will be used for this run.
    :param all_dataset_mountpoints: When using the datasets in AzureML, these are the per-dataset mount points.
    :param environment_name: If specified, try to retrieve the existing Python environment with this name. If that
        is not found, create one from the Conda files provided in `source_config`. This parameter is meant to be used
        when running inference for an existing model.
    :return: The configured script run.
    """
    dataset_consumptions = create_dataset_consumptions(azure_config, all_azure_dataset_ids, all_dataset_mountpoints)
    # AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
    entry_script_relative_path = source_config.entry_script.relative_to(source_config.root_folder).as_posix()
    logging.info(f"Entry script {entry_script_relative_path} ({source_config.entry_script} relative to "
                 f"source directory {source_config.root_folder})")
    max_run_duration = None
    if azure_config.max_run_duration:
        max_run_duration = run_duration_string_to_seconds(azure_config.max_run_duration)
    workspace = azure_config.get_workspace()
    run_config = RunConfiguration(
        script=entry_script_relative_path,
        arguments=source_config.script_params,
    )
    run_config.environment = get_or_create_python_environment(azure_config, source_config,
                                                              environment_name=environment_name)
    run_config.target = azure_config.cluster
    run_config.max_run_duration_seconds = max_run_duration
    if azure_config.num_nodes > 1:
        distributed_job_config = MpiConfiguration(node_count=azure_config.num_nodes)
        run_config.mpi = distributed_job_config
        run_config.framework = "Python"
        run_config.communicator = "IntelMpi"
        run_config.node_count = distributed_job_config.node_count
    if len(dataset_consumptions) > 0:
        run_config.data = {dataset.name: dataset for dataset in dataset_consumptions}
    # Use blob storage for storing the source, rather than the FileShares section of the storage account.
    run_config.source_directory_data_store = workspace.datastores.get(WORKSPACE_DEFAULT_BLOB_STORE_NAME).name
    script_run_config = ScriptRunConfig(
        source_directory=str(source_config.root_folder),
        run_config=run_config,
    )
    if azure_config.hyperdrive:
        script_run_config = source_config.hyperdrive_config_func(script_run_config)  # type: ignore
    return script_run_config
async def __create_cluster(self):
    self.__print_message("Setting up cluster")
    exp = Experiment(self.workspace, self.experiment_name)
    estimator = Estimator(
        os.path.join(self.abs_path, "setup"),
        compute_target=self.compute_target,
        entry_script="start_jupyter.py",
        environment_definition=self.environment_definition,
        script_params=self.scheduler_params,
        node_count=1,  ### start only scheduler
        distributed_training=MpiConfiguration(),
        use_docker=True,
        inputs=self.datastores,
    )

    run = exp.submit(estimator, tags=self.tags)

    self.__print_message("Waiting for compute cluster's IP")
    while (
        run.get_status() != "Canceled"
        and run.get_status() != "Failed"
        and "jupyter" not in run.get_metrics()
        # and "scheduler" not in run.get_metrics()
    ):
        print(".", end="")
        logger.info("Compute Cluster not ready")
        time.sleep(5)

    if run.get_status() == "Canceled" or run.get_status() == "Failed":
        logger.exception("Failed to start the AzureML Compute Cluster")
        raise Exception("Failed to start the AzureML Compute Cluster.")

    print("\n\n")

    ### SET FLAGS
    ####---self.scheduler_ip_port = run.get_metrics()["scheduler"]
    self.scheduler_ip_port = run.get_metrics()["jupyter"]
    self.worker_params["--scheduler_ip_port"] = self.scheduler_ip_port
    self.__print_message(f'Scheduler: {run.get_metrics()["scheduler"]}')
    self.run = run
    logger.info(f'Scheduler: {run.get_metrics()["scheduler"]}')

    ### CHECK IF ON THE SAME VNET
    while self.same_vnet is None:
        self.__check_if_scheduler_ip_reachable()
        time.sleep(1)

    ### REQUIRED BY dask.distributed.deploy.cluster.Cluster
    ####---_scheduler = self.__prepare_rpc_connection_to_headnode()
    ####---self.scheduler_comm = rpc(_scheduler)
    ####---await self.sync(self.__setup_port_forwarding)
    ####---await self.sync(super()._start)
    _scheduler = self.__prepare_rpc_connection_to_headnode()
    self.__setup_port_forwarding()
    self.__update_links()

    self.__print_message("Connections established")
def create_run_config(azure_config: AzureConfig,
                      source_config: SourceConfig,
                      azure_dataset_id: str = "",
                      environment_name: str = "") -> ScriptRunConfig:
    """
    Creates a configuration to run the InnerEye training script in AzureML.
    :param azure_config: azure related configurations to use for model scale-out behaviour
    :param source_config: configurations for model execution, such as name and execution mode
    :param azure_dataset_id: The name of the dataset in blob storage to be used for this run. This can be an empty
        string to not use any datasets.
    :param environment_name: If specified, try to retrieve the existing Python environment with this name. If that
        is not found, create one from the Conda files provided in `source_config`. This parameter is meant to be used
        when running inference for an existing model.
    :return: The configured script run.
    """
    if azure_dataset_id:
        azureml_dataset = get_or_create_dataset(azure_config, azure_dataset_id=azure_dataset_id)
        if not azureml_dataset:
            raise ValueError(f"AzureML dataset {azure_dataset_id} could not be found or created.")
        named_input = azureml_dataset.as_named_input(INPUT_DATA_KEY)
        dataset_consumption = named_input.as_mount() if azure_config.use_dataset_mount else named_input.as_download()
    else:
        dataset_consumption = None
    # AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
    entry_script_relative_path = source_config.entry_script.relative_to(source_config.root_folder).as_posix()
    logging.info(f"Entry script {entry_script_relative_path} ({source_config.entry_script} relative to "
                 f"source directory {source_config.root_folder})")
    max_run_duration = None
    if azure_config.max_run_duration:
        max_run_duration = run_duration_string_to_seconds(azure_config.max_run_duration)
    workspace = azure_config.get_workspace()
    run_config = RunConfiguration(
        script=entry_script_relative_path,
        arguments=source_config.script_params,
    )
    run_config.environment = get_or_create_python_environment(azure_config, source_config,
                                                              environment_name=environment_name)
    run_config.target = azure_config.cluster
    run_config.max_run_duration_seconds = max_run_duration
    if azure_config.num_nodes > 1:
        distributed_job_config = MpiConfiguration(node_count=azure_config.num_nodes)
        run_config.mpi = distributed_job_config
        run_config.framework = "Python"
        run_config.communicator = "IntelMpi"
        run_config.node_count = distributed_job_config.node_count
    if dataset_consumption:
        run_config.data = {dataset_consumption.name: dataset_consumption}
    # Use blob storage for storing the source, rather than the FileShares section of the storage account.
    run_config.source_directory_data_store = workspace.datastores.get(WORKSPACE_DEFAULT_BLOB_STORE_NAME).name
    script_run_config = ScriptRunConfig(
        source_directory=str(source_config.root_folder),
        run_config=run_config,
    )
    if azure_config.hyperdrive:
        script_run_config = source_config.hyperdrive_config_func(script_run_config)  # type: ignore
    return script_run_config
# Create image registry configuration
if experiment_settings["docker"]["custom_image"]:
    container_registry = ContainerRegistry()
    container_registry.address = experiment_settings["docker"]["custom_image_registry_details"]["address"]
    container_registry.username = experiment_settings["docker"]["custom_image_registry_details"]["username"]
    container_registry.password = experiment_settings["docker"]["custom_image_registry_details"]["password"]
else:
    container_registry = None

# Create distributed training configuration
if experiment_settings["distributed_training"]["backend_config"] == "mpi":
    distrib_training_backend = MpiConfiguration()
    distrib_training_backend.process_count_per_node = \
        experiment_settings["distributed_training"]["mpi"]["process_count_per_node"]
elif experiment_settings["distributed_training"]["backend_config"] == "parameter_server":
    distrib_training_backend = TensorflowConfiguration()
    distrib_training_backend.worker_count = \
        experiment_settings["distributed_training"]["parameter_server"]["worker_count"]
    distrib_training_backend.parameter_server_count = \
        experiment_settings["distributed_training"]["parameter_server"]["parameter_server_count"]
elif experiment_settings["distributed_training"]["backend_config"] == "gloo":
    distrib_training_backend = Gloo()
elif experiment_settings["distributed_training"]["backend_config"] == "nccl":
    distrib_training_backend = Nccl()
else:
    distrib_training_backend = None
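# Hedged usage sketch (assumption, not part of the original snippet): how the objects
# built above could be handed to an SDK v1 Estimator, following the pattern of the other
# samples in this collection. The source directory, entry script and compute target below
# are hypothetical placeholders.
from azureml.train.estimator import Estimator

example_estimator = Estimator(
    source_directory="./scripts",            # placeholder
    entry_script="train.py",                 # placeholder
    compute_target="gpu-cluster",            # placeholder compute target name
    distributed_training=distrib_training_backend,
    image_registry_details=container_registry,
    use_docker=True,
)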
def main():
    # Collect command line arguments
    args = parse_command_line_args()

    # Collect runclass and default (hot) dataset name
    dataset = sharedconfig.dataset_hot

    # Replace/update args for using premium storage
    if args.premium:
        dataset = sharedconfig.dataset_premium

    # Replace/update args for using cool storage
    if args.cool:
        dataset = sharedconfig.dataset_cool

    workspace = get_or_create_workspace(
        sharedconfig.subscription,
        sharedconfig.resource_group,
        sharedconfig.workspace_name,
    )

    # Get and update the ClusterConnector object
    # NOTE: This is *NOT* an azureml.core.compute.AmlCompute object but a wrapper
    # See clusterconnector.py for more details
    clusterconnector = create_or_update_cluster(
        workspace,
        sharedconfig.cluster_name,
        args.num_nodes,
        sharedconfig.ssh_key,
        sharedconfig.vm_type,
        terminate_on_failure=True,
        use_beeond=False,
    )

    # Get and update the AzureML Environment object
    environment = create_or_update_environment(
        workspace, sharedconfig.environment_name, sharedconfig.docker_image
    )

    # Get/Create an experiment object
    experiment = Experiment(workspace=workspace, name=sharedconfig.experiment_name)

    # Configure the distributed compute settings
    pytorchconfig = MpiConfiguration(
        node_count=args.num_nodes, process_count_per_node=sharedconfig.gpus_per_node
    )

    # Collect arguments to be passed to training script
    script_args = ["--dataset", dataset]
    script_args.extend(
        generate_training_opts(
            args.num_nodes * sharedconfig.gpus_per_node,
            sharedconfig.ims_per_gpu,
            args.iter,
        )
    )
    script_args.extend(["PATHS_CATALOG", "./dataset_catalog.py"])

    # Define the configuration for running the training script
    script_conf = ScriptRunConfig(
        source_directory="train",
        script="train_net_download.py",
        compute_target=clusterconnector.cluster,
        environment=environment,
        arguments=script_args,
        distributed_job_config=pytorchconfig,
    )

    # We can use these tags to make a note of run parameters (avoids grepping the logs)
    runtags = {
        "class": k_runclass,
        "vmtype": sharedconfig.vm_type,
        "num_nodes": args.num_nodes,
        "ims_per_gpu": sharedconfig.ims_per_gpu,
        "iter": args.iter,
    }

    # Submit the run
    run = experiment.submit(config=script_conf, tags=runtags)

    # Can optionally choose to follow the output on the command line
    if args.follow:
        run.wait_for_completion(show_output=True)
def get_pipeline(aml_compute: ComputeTarget, blob_ds: Datastore, batch_env: Environment,
                 tf_env: Environment) -> List[PythonScriptStep]:
    """
    Creates pipeline steps.

    Parameters:
        aml_compute (ComputeTarget): a reference to a compute target
        blob_ds (Datastore): a reference to a datastore
        batch_env (Environment): a reference to an environment object
        tf_env (Environment): a Horovod/TensorFlow environment

    Returns:
        List[PythonScriptStep]: the list of pipeline steps
    """
    # We need something to generate data by the way
    pipeline_files = PipelineData("pipeline_files", datastore=blob_ds).as_dataset()

    # Pipeline parameters to use with every run
    is_debug = PipelineParameter("is_debug", default_value=False)
    relay_connection_name = PipelineParameter("debug_relay_connection_name",
                                              default_value="none")

    single_step_config = RunConfiguration()
    single_step_config.environment = batch_env
    single_step = PythonScriptStep(
        name="single-step",
        script_name="samples/azure_ml_advanced/steps/single_step.py",
        source_directory=".",
        runconfig=single_step_config,
        arguments=[
            "--pipeline-files", pipeline_files, "--is-debug", is_debug,
            "--debug-relay-connection-name", relay_connection_name,
            "--debug-port", 5678,
            "--debug-relay-connection-string-secret", debug_connection_string_secret_name
        ],
        inputs=[],
        outputs=[pipeline_files],
        compute_target=aml_compute,
        allow_reuse=False)

    output_dir = PipelineData("output_dir")

    parallel_run_config = ParallelRunConfig(
        entry_script="samples/azure_ml_advanced/steps/parallel_step.py",
        source_directory=".",
        mini_batch_size="5",
        output_action="summary_only",
        environment=batch_env,
        compute_target=aml_compute,
        error_threshold=10,
        run_invocation_timeout=600,  # very important for debugging
        node_count=2,
        process_count_per_node=1)

    parallelrun_step = ParallelRunStep(
        name="parallel-run-step",
        parallel_run_config=parallel_run_config,
        inputs=[pipeline_files],
        output=output_dir,
        arguments=[
            "--is-debug", is_debug,
            "--debug-relay-connection-name", relay_connection_name,
            "--debug-port", 5679,
            "--debug-relay-connection-string-secret", debug_connection_string_secret_name
        ],
        allow_reuse=False)

    parallelrun_step.run_after(single_step)

    distr_config = MpiConfiguration(process_count_per_node=1, node_count=2)

    src = ScriptRunConfig(
        source_directory=".",
        script="samples/azure_ml_advanced/steps/mpi/mpi_step_starter.py",
        arguments=[
            "--input-ds", pipeline_files, "--is-debug", is_debug,
            "--debug-relay-connection-name", relay_connection_name,
            "--debug-port", 5680,
            "--debug-relay-connection-string-secret", debug_connection_string_secret_name
        ],
        compute_target=compute_name,
        environment=tf_env,
        distributed_job_config=distr_config,
    )

    mpi_step = PythonScriptStep(
        name="mpi-step",
        script_name="samples/azure_ml_advanced/steps/mpi/mpi_step_starter.py",
        arguments=[
            "--input-ds", pipeline_files, "--is-debug", is_debug,
            "--debug-relay-connection-name", relay_connection_name,
            "--debug-port", 5680,
            "--debug-relay-connection-string-secret", debug_connection_string_secret_name
        ],
        compute_target=aml_compute,
        inputs=[pipeline_files],
        outputs=[],
        runconfig=src.run_config,
        source_directory=".")

    mpi_step.run_after(parallelrun_step)

    print("Pipeline Steps Created")

    steps = [single_step, parallelrun_step, mpi_step]
    print(f"Returning {len(steps)} steps")
    return steps
    compute_target = ComputeTarget(workspace=ws, name=compute_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6', min_nodes=0, max_nodes=6)
    compute_target = ComputeTarget.create(ws, compute_name, compute_config)
    compute_target.wait_for_completion(show_output=True)

run_conf.target = compute_target
run_conf.environment.docker.enabled = True
run_conf.environment.docker.base_image = DEFAULT_CPU_IMAGE
run_conf.environment.python.conda_dependencies = \
    CondaDependencies(conda_dependencies_file_path='env.yml')
run_conf.environment.python.user_managed_dependencies = False
if cv:
    run_conf.communicator = 'OpenMPI'
    run_conf.mpi = MpiConfiguration()
    run_conf.node_count = cv + 2

exp = Experiment(workspace=ws, name=config['experiment_name'])

use_estimator = True
if use_estimator:
    if cv:
        script_params = {'--cv': cv}
        node_count = cv + 2  # dask-mpi uses 2 nodes for its scheduler and client
        distributed_training = MpiConfiguration()
    else:
        script_params = None
        node_count = None
        distributed_training = None
    to_run = Estimator(source_directory='.',
                       compute_target=compute_target,
# azure ml settings
experiment_name = "tensorflow-mnist-distributed"
compute_name = "gpu-8x-a100"

# environment
env = Environment.get(
    workspace=ws,
    name="AzureML-tensorflow-2.4-ubuntu18.04-py37-cuda11-gpu").clone(
        "tensorflow-2.4-gpu")

# Experiment configuration
node_count = 2  # number of nodes
process_count_per_node = 8  # number of GPUs per node

# create distributed config
distr_config = MpiConfiguration(process_count_per_node=process_count_per_node,
                                node_count=node_count)

# create arguments
args = ["--epochs", 5]

# create job config
src = ScriptRunConfig(
    source_directory=source_dir,
    script=script_name,
    arguments=args,
    compute_target=compute_name,
    environment=env,
    distributed_job_config=distr_config,
)

# submit job
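# Hedged completion (assumption): the snippet ends at "# submit job"; with the SDK v1
# pattern used elsewhere in these samples, submission would look roughly like this.
from azureml.core import Experiment

run = Experiment(ws, experiment_name).submit(src)
run.wait_for_completion(show_output=True)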
# define script parameters
script_params_3 = {
    '--models': models,
    '--data_folder_train': dataset_train.as_named_input('train').as_mount(),
    '--data_folder_test': dataset_test.as_named_input('test').as_mount(),
    '--local': 'no'
}

estimator = PyTorch(
    entry_script='train.py',
    script_params=script_params_3,
    source_directory=os.path.dirname(os.path.realpath(__file__)),
    compute_target=workspace.compute_targets["alwaysoncluster"],
    distributed_training=MpiConfiguration(),
    framework_version='1.4',
    use_gpu=True,
    pip_packages=[
        'numpy==1.15.4',
        'pandas==0.23.4',
        'scikit-learn==0.20.1',
        'scipy==1.0.0',
        'matplotlib==3.0.2',
        'utils==0.9.0',
        'onnxruntime==1.2.0',
        'onnx==1.6.0'
    ])

experiment = Experiment(workspace=workspace, name="deeplearning")
run = experiment.submit(estimator)

if hyperdrive is True:
    # Define multi-run configuration
    hyperdrive_run_config = HyperDriveConfig(
        estimator=estimator,
async def __create_cluster(self):
    self.__print_message("Setting up cluster")
    run = None
    if self.parent_run:  ## scheduler run as child run
        run_config = RunConfiguration()
        run_config.environment = self.environment_definition
        run_config.target = self.compute_target
        args = []
        for key, value in self.scheduler_params.items():
            args.append(f"{key}={value}")
        file_dataset_registered_name = self.kwargs.get('file_dataset_registered_name', None)
        dataset_config_name = self.kwargs.get('dataset_config_name', None)
        path_on_compute = self.kwargs.get('path_on_compute', None)
        if path_on_compute is not None:
            dataset = Dataset.get_by_name(workspace=self.workspace,
                                          name=file_dataset_registered_name)
            input1 = dataset.as_named_input(dataset_config_name).as_mount(
                path_on_compute=path_on_compute)
            args.append(input1)

        child_run_config = ScriptRunConfig(
            source_directory=os.path.join(self.abs_path, "setup"),
            script="start_scheduler.py",
            arguments=args,
            run_config=run_config,
        )
        run = self.parent_run.submit_child(child_run_config, tags=self.tags)
    else:
        # submit scheduler run
        exp = Experiment(self.workspace, self.experiment_name)
        estimator = Estimator(
            os.path.join(self.abs_path, "setup"),
            compute_target=self.compute_target,
            entry_script="start_scheduler.py",
            environment_definition=self.environment_definition,
            script_params=self.scheduler_params,
            node_count=1,  ### start only scheduler
            distributed_training=MpiConfiguration(),
            use_docker=True,
            inputs=self.datastores,
        )
        run = exp.submit(estimator, tags=self.tags)

    self.__print_message("Waiting for scheduler node's IP")
    while (run.get_status() != "Canceled"
           and run.get_status() != "Failed"
           and "scheduler" not in run.get_metrics()):
        print(".", end="")
        logger.info("Scheduler not ready")
        time.sleep(5)

    if run.get_status() == "Canceled" or run.get_status() == "Failed":
        logger.exception("Failed to start the AzureML cluster")
        raise Exception("Failed to start the AzureML cluster.")

    self.run = run
    print("\n\n")

    ### SET FLAGS
    self.scheduler_ip_port = run.get_metrics()["scheduler"]
    self.worker_params["--scheduler_ip_port"] = self.scheduler_ip_port
    self.__print_message(f'Scheduler: {run.get_metrics()["scheduler"]}')
    logger.info(f'Scheduler: {run.get_metrics()["scheduler"]}')

    ### CHECK IF ON THE SAME VNET
    print("check if on the same vnet")
    while self.same_vnet is None:
        await self.sync(self.__check_if_scheduler_ip_reachable)
        time.sleep(1)

    ### REQUIRED BY dask.distributed.deploy.cluster.Cluster
    _scheduler = self.__prepare_rpc_connection_to_headnode()
    self.scheduler_comm = rpc(_scheduler)
    await self.sync(self.__setup_port_forwarding)
    await self.sync(super()._start)
    await self.sync(self.__update_links)

    self.__print_message("Connections established")
    self.__print_message(f"Scaling to {self.initial_node_count} workers")

    # LOGIC TO KEEP PROPER TRACK OF WORKERS IN `scale`
    if self.initial_node_count > 1:
        self.scale(self.initial_node_count)
    self.__print_message("Scaling is done")
def main():
    parser = argparse.ArgumentParser(
        description="Run Elbencho on a BeeOND enabled cluster"
    )
    parser.add_argument("num_nodes", type=int, help="Number of nodes")
    parser.add_argument("--follow", action="store_true", help="Follow run output")
    parser.add_argument(
        "--keep-cluster",
        action="store_true",
        help="Don't autoscale cluster down when idle (after run completed)",
    )
    parser.add_argument(
        "--keep-failed-cluster", dest="terminate_on_failure", action="store_false"
    )
    parser.add_argument("--sharedfiles", action="store_false", dest="multifile")
    args = parser.parse_args()

    workspace = get_or_create_workspace(
        sharedconfig.subscription_id,
        sharedconfig.resource_group_name,
        sharedconfig.workspace_name,
        sharedconfig.location,
    )

    try:
        clusterconnector = create_or_update_cluster(
            workspace,
            sharedconfig.cluster_name,
            args.num_nodes,
            sharedconfig.ssh_key,
            sharedconfig.vm_type,
            terminate_on_failure=args.terminate_on_failure,
            use_beeond=True,
        )
    except RuntimeError:
        cprint("Fatal Error - exiting", "red", attrs=["bold"])
        sys.exit(-1)

    docker_args = [
        "-v",
        "{}:{}".format(clusterconnector.beeond_mnt, sharedconfig.beeond_map),
    ]

    # Get and update the AzureML Environment object
    environment = create_or_update_environment(
        workspace, sharedconfig.environment_name, sharedconfig.docker_image, docker_args
    )

    # Get/Create an experiment object
    experiment = Experiment(workspace=workspace, name=sharedconfig.experiment_name)

    # Configure the distributed compute settings
    parallelconfig = MpiConfiguration(
        node_count=args.num_nodes, process_count_per_node=1
    )

    if args.multifile:
        runscript = "./run_elbencho_multifile.sh"
    else:
        runscript = "./run_elbencho_largefile.sh"

    # Collect arguments to be passed to elbencho script
    script_args = [
        "bash",
        runscript,
        sharedconfig.beeond_map,
        str(args.num_nodes),
        *clusterconnector.ibaddrs,
    ]

    # Define the configuration for running the training script
    script_conf = ScriptRunConfig(
        source_directory="scripts",
        command=script_args,
        compute_target=clusterconnector.cluster,
        environment=environment,
        distributed_job_config=parallelconfig,
    )

    # We can use these tags to make a note of run parameters (avoids grepping the logs)
    runtags = {
        "class": k_runclass,
        "vmtype": sharedconfig.vm_type,
        "num_nodes": args.num_nodes,
        "run_type": "multifile" if args.multifile else "sharedfile",
    }

    # Submit the run
    run = experiment.submit(config=script_conf, tags=runtags)

    # Can optionally choose to follow the output on the command line
    if args.follow:
        run.wait_for_completion(show_output=True)
# show up if you view the cl help
args, args_to_pass_on = parser.parse_known_args()

ws = Workspace.from_config()
experiment = Experiment(workspace=ws, name=args.name)

# Since for the training script directories are relative to the training script
params_path = os.path.join("src", args.params_path)
with open(params_path) as f:
    params = yaml.safe_load(f)
params.update(vars(args))

node_count = params["num_gpus"] if params["use_hvd"] else 1
distr_config = MpiConfiguration(node_count=node_count)

config = ScriptRunConfig(
    source_directory="./src",
    script="model/train.py",
    compute_target=args.compute_target,
    distributed_job_config=distr_config,
    arguments=["--params-path", args.params_path] + args_to_pass_on,
)
config.run_config.environment = load_azml_env()

run = experiment.submit(config)
if args.run_label is not None:
    run.display_name = args.run_label
registry_address = acr.group(1)  # onnxtraining.azurecr.io
registry_name = acr.group(2)     # onnxtraining
container_image = acr.group(3)   # azureml/bert:latest

registry_client = get_client_from_cli_profile(
    ContainerRegistryManagementClient, subscription_id=args.subscription)
registry_credentials = registry_client.registries.list_credentials(
    args.container_registry_resource_group, registry_name)

registry_details = ContainerRegistry()
registry_details.address = registry_address
registry_details.username = registry_credentials.username
registry_details.password = registry_credentials.passwords[0].value

# MPI configuration if executing a distributed run
mpi = MpiConfiguration()
mpi.process_count_per_node = args.gpu_count

# AzureML Estimator that describes how to run the Experiment
estimator = Estimator(source_directory='./',
                      script_params=script_params,
                      compute_target=compute_target,
                      node_count=args.node_count,
                      distributed_training=mpi,
                      image_registry_details=registry_details,
                      use_docker=True,
                      custom_docker_image=container_image,
                      entry_script='train.py',
                      inputs=[ds.path('./').as_mount()])

# Start the AzureML Experiment
ws = Workspace("48bbc269-ce89-4f6f-9a12-c6f91fcb772d", "aml1p-rg", "aml1p-ml-wus2") env = Environment.from_conda_specification("hydra-pl", "environment.yml") env.docker.enabled = True env.docker.base_image = ( "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04") # ============================================================================== node_count = 2 gpus_per_node = -1 cluster = "gpu-nc24-lowpri" # ============================================================================== mpi_config = MpiConfiguration(process_count_per_node=1, node_count=node_count) config = ScriptRunConfig( source_directory=".", script="train.py", compute_target=cluster, distributed_job_config=mpi_config, environment=env, arguments=[ f"trainer.gpus={gpus_per_node}", f"trainer.num_nodes={node_count}", "+trainer.accelerator=ddp", ], ) exp = Experiment(ws, "azuremlv2")
async def __create_cluster(self):
    self.__print_message("Setting up cluster")
    exp = Experiment(self.workspace, self.experiment_name)
    estimator = Estimator(
        os.path.join(self.abs_path, "setup"),
        compute_target=self.compute_target,
        entry_script="start_scheduler.py",
        environment_definition=self.environment_definition,
        script_params=self.scheduler_params,
        node_count=1,  ### start only scheduler
        distributed_training=MpiConfiguration(),
        use_docker=True,
        inputs=self.datastores,
    )

    run = exp.submit(estimator, tags=self.tags)

    self.__print_message("Waiting for scheduler node's IP")
    status = run.get_status()
    while (
        status != "Canceled"
        and status != "Failed"
        and "scheduler" not in run.get_metrics()
    ):
        print(".", end="")
        logger.info("Scheduler not ready")
        time.sleep(5)
        status = run.get_status()

    if status == "Canceled" or status == "Failed":
        run_error = run.get_details().get("error")
        error_message = "Failed to start the AzureML cluster."
        if run_error:
            error_message = "{} {}".format(error_message, run_error)
        logger.exception(error_message)
        if not self.compute_target_set:
            self.__delete_compute_target()
        raise Exception(error_message)

    print("\n")

    ### SET FLAGS
    self.scheduler_ip_port = run.get_metrics()["scheduler"]
    self.worker_params["--scheduler_ip_port"] = self.scheduler_ip_port
    self.__print_message(f'Scheduler: {run.get_metrics()["scheduler"]}')
    self.run = run

    ### CHECK IF ON THE SAME VNET
    max_retry = 5
    while self.same_vnet is None and max_retry > 0:
        time.sleep(5)
        await self.sync(self.__check_if_scheduler_ip_reachable)
        max_retry -= 1

    if self.same_vnet is None:
        self.run.cancel()
        if not self.compute_target_set:
            self.__delete_compute_target()
        logger.exception(
            "Connection error after retrying. Failed to start the AzureML cluster."
        )
        return

    ### REQUIRED BY dask.distributed.deploy.cluster.Cluster
    self.hostname = socket.gethostname()
    self.is_in_ci = (
        f"/mnt/batch/tasks/shared/LS_root/mounts/clusters/{self.hostname}"
        in os.getcwd()
    )
    _scheduler = self.__prepare_rpc_connection_to_headnode()
    self.scheduler_comm = rpc(_scheduler)
    await self.sync(self.__setup_port_forwarding)

    try:
        await super()._start()
    except Exception as e:
        logger.exception(e)
        # CLEAN UP COMPUTE TARGET
        self.run.cancel()
        if not self.compute_target_set:
            self.__delete_compute_target()
        return

    await self.sync(self.__update_links)

    self.__print_message("Connections established")
    self.__print_message(f"Scaling to {self.initial_node_count} workers")
    if self.initial_node_count > 1:
        self.scale(
            self.initial_node_count
        )  # LOGIC TO KEEP PROPER TRACK OF WORKERS IN `scale`
    self.__print_message("Scaling is done")
def get_distributed_job_config(args: JobArguments):
    n_proc = TARGET_GPU_COUNT[args.target_name]
    distributed_job_config = MpiConfiguration(
        process_count_per_node=n_proc, node_count=args.node_count
    )
    return distributed_job_config
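# Hedged usage sketch (assumption, not part of the original source): wiring the helper
# above into a ScriptRunConfig, following the pattern used by the other samples here.
# The JobArguments constructor, workspace lookup, script path, compute target and
# experiment name are hypothetical placeholders.
from azureml.core import Experiment, ScriptRunConfig, Workspace

ws = Workspace.from_config()
job_args = JobArguments(target_name="gpu-cluster", node_count=2)  # hypothetical arguments
src = ScriptRunConfig(
    source_directory="./src",        # placeholder
    script="train.py",               # placeholder
    compute_target=job_args.target_name,
    distributed_job_config=get_distributed_job_config(job_args),
)
run = Experiment(ws, "mpi-distributed-job").submit(src)  # placeholder experiment name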
env.python.user_managed_dependencies = True
env.python.interpreter_path = "/opt/miniconda/bin/python"

# To install any Python packages you need, simply add RUN pip install package-name
# to the docker string, e.g. `RUN pip install sklearn`.
# Specify docker steps as a string and use the base DeepSpeed Docker image
dockerfile = r"""
FROM deepspeed/base-aml:with-pt-ds-and-deps
RUN pip install azureml-mlflow
RUN echo "Welcome to the DeepSpeed custom environment!"
"""

# set base image to None, because the image is defined by dockerfile.
env.docker.base_image = None
env.docker.base_dockerfile = dockerfile

# create job config
mpi_config = MpiConfiguration(node_count=2, process_count_per_node=2)

src = ScriptRunConfig(
    source_directory=script_dir,
    script=script_name,
    arguments=arguments,
    environment=env,
    compute_target=compute_name,
    distributed_job_config=mpi_config,
)

# submit job
run = Experiment(ws, experiment_name).submit(src)
run.wait_for_completion(show_output=True)
parser.add_argument("-g", "--num-gpus", default=20) args = parser.parse_args() ws = Workspace.from_config() experiment = Experiment(workspace=ws, name=args.experiment) remote_model_file = ( os.path.join("outputs", args.model_file) if not args.model_file.startswith("outputs") else args.model_file ) model_file = f"data/azml/{args.run_id}_{args.model_file}.pt" if not os.path.exists(model_file): download_model_file(args.run_id, remote_model_file, model_file) distr_config = MpiConfiguration(node_count=args.num_gpus) config = ScriptRunConfig( source_directory="./src", script="model/predict_hvd.py", compute_target=args.compute_target, distributed_job_config=distr_config, arguments=[ "--aoi", args.aoi_file, "--feature-file", args.feature_file, "--model-file", os.path.basename(model_file), ], max_run_duration_seconds=60 * 30, environment=load_azml_env(),