def create_and_submit_experiment(azure_config: AzureConfig,
                                 source_config: SourceConfig,
                                 model_config_overrides: str,
                                 azure_dataset_id: str) -> Run:
    """
    Creates an AzureML experiment in the workspace and submits it for execution.
    :param azure_config: azure related configurations to set up a valid workspace
    :param source_config: The information about which code should be submitted, and which arguments should be used.
    :param model_config_overrides: A string that describes which model parameters were overwritten by commandline
    arguments in the present run. This is only used for diagnostic purposes (it is set as a Tag on the run).
    :param azure_dataset_id: The name of the dataset in blob storage to be used for this run.
    :returns: Run object for the submitted AzureML run
    """
    workspace = azure_config.get_workspace()
    experiment_name = create_experiment_name(azure_config)
    exp = Experiment(workspace=workspace, name=azure_util.to_azure_friendly_string(experiment_name))
    script_run_config = create_run_config(azure_config, source_config, azure_dataset_id)

    # Submit a training/testing run associated with the experiment.
    run: Run = exp.submit(script_run_config)

    # Set metadata for the run.
    set_run_tags(run, azure_config, model_config_overrides)

    print("\n==============================================================================")
    print(f"Successfully queued new run {run.id} in experiment: {exp.name}")
    if azure_config.run_recovery_id:
        print(f"\nRecovered from: {azure_config.run_recovery_id}")
    recovery_id = azure_util.create_run_recovery_id(run)
    recovery_file = Path(RUN_RECOVERY_FILE)
    if recovery_file.exists():
        recovery_file.unlink()
    recovery_file.write_text(recovery_id)
    print(f"Experiment URL: {exp.get_portal_url()}")
    print(f"Run URL: {run.get_portal_url()}")
    print("If this run fails, re-start runner.py and supply these additional arguments: "
          f"--run_recovery_id={recovery_id}")
    print(f"The run recovery ID has been written to this file: {recovery_file}")
    print("==============================================================================")
    if azure_config.tensorboard and azure_config.azureml:
        print("Starting TensorBoard now because you specified --tensorboard")
        monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[run.id]), azure_config=azure_config)
    else:
        print("To monitor this run locally using TensorBoard, run the script: "
              f"InnerEye/Azure/tensorboard_monitor.py --run_ids={run.id}")
    print("==============================================================================")
    return run
def submit_azureml_run(args: JobArguments):
    """Submit GLUE experiment to AzureML."""
    ws = Workspace.from_config()
    print("ws: ", ws)

    # The training code lives in the "src" folder next to this script.
    prefix = Path(__file__).parent
    source_directory = str(prefix.joinpath("src"))

    target = ws.compute_targets[args.target_name]
    env = get_azureml_environment()
    distributed_job_config = get_distributed_job_config(args)

    cmd = f"""ds_report && python finetune_glue.py
    --output_dir outputs
    --model_checkpoint {args.model_checkpoint}
    --task {args.task}
    --num_train_epochs {args.num_train_epochs}
    --per_device_train_batch_size {args.per_device_train_batch_size}
    --per_device_eval_batch_size {args.per_device_eval_batch_size}
    --disable_tqdm 1
    --local_rank $OMPI_COMM_WORLD_LOCAL_RANK
    --deepspeed ds_config.json
    """.split()

    config = ScriptRunConfig(
        source_directory=source_directory,
        command=cmd,
        environment=env,
        compute_target=target,
        distributed_job_config=distributed_job_config,
    )

    run = Experiment(ws, "deepspeed-transformers-example").submit(config)
    print(run.get_portal_url())  # link to ml.azure.com
    run.set_tags(asdict(args))
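# The snippet above assumes a JobArguments dataclass (note the asdict(args)
# call when tagging the run) plus get_azureml_environment() and
# get_distributed_job_config() helpers defined elsewhere. A minimal sketch of
# what the dataclass might look like, inferred only from the fields the
# function reads:
from dataclasses import dataclass


@dataclass
class JobArguments:
    target_name: str
    model_checkpoint: str
    task: str
    num_train_epochs: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int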
search_space = {
    "--learning_rate": hyperdrive.loguniform(
        convert_base(1e-6), convert_base(5e-2)
    ),  # NB. loguniform on [exp(min), exp(max)]
    "--weight_decay": hyperdrive.uniform(5e-3, 15e-2),
    "--per_device_train_batch_size": hyperdrive.choice([16, 32]),
}

hyperparameter_sampling = RandomParameterSampling(search_space)

policy = TruncationSelectionPolicy(
    truncation_percentage=50, evaluation_interval=2, delay_evaluation=0
)

hyperdrive_config = HyperDriveConfig(
    run_config=config,
    hyperparameter_sampling=hyperparameter_sampling,
    policy=policy,
    primary_metric_name="eval_matthews_correlation",
    primary_metric_goal=hyperdrive.PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=8,
)

run = Experiment(ws, "transformers-glue-finetuning-hyperdrive").submit(hyperdrive_config)
print(run.get_portal_url())
run.wait_for_completion(show_output=True)
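# convert_base is not defined in the snippet above. Since
# hyperdrive.loguniform(a, b) draws values distributed as exp(uniform(a, b))
# (which is what the "NB." comment alludes to), the helper presumably maps the
# desired bounds into log space; a plausible one-liner:
import math


def convert_base(x: float) -> float:
    # Take the natural log so that loguniform(convert_base(lo), convert_base(hi))
    # samples log-uniformly on the interval [lo, hi].
    return math.log(x)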
)

#### SET PROPER INTERPRETER
estimator._estimator_config.environment.python.interpreter_path = '/opt/conda/envs/rapids/bin/python'

print_message("STARTING EXPERIMENT")
experiment = Experiment(workspace, args.experiment_name).submit(estimator)

print()
print_message("WAITING FOR THE HEADNODE")
print_message("NOTE: THIS MAY TAKE SEVERAL MINUTES", filler='!')
print_message("TRACK PROGRESS HERE --->>> ", filler='%')
print_message(experiment.get_portal_url(), filler='%')
print()
print_message("SPINNING UP THE DASK CLUSTER")

rep = 0
done = False
prev_status = ""
spinning_thread = threading.Thread(target=spinner)
spinning_thread.start()
start_time = time.time()
timeout_sec = args.timeout_minutes * 60

while "headnode" not in experiment.get_metrics():
    rep += 1
    time.sleep(5)
"--datastore": workspace.get_default_datastore(), "--n_gpus_per_node": str(n_gpus_per_node), "--jupyter_token": str(args.jupyter_token) }, distributed_training=Mpi(process_count_per_node=1), node_count=int(args.node_count), use_gpu=True, conda_dependencies_file='rapids-0.10.yml') print("Starting experiment run ...") experiment = Experiment(workspace, args.experiment_name).submit(estimator) print(" ... waiting for headnode ...") print(" ... this may take several minutes ...") print("(For updated results, see: ", experiment.get_portal_url(), ")") rep = 0 done = False prev_status = "" spinning_thread = threading.Thread(target=spinner) spinning_thread.start() start_time = time.time() timeout_sec = args.timeout_minutes * 60 while not "headnode" in experiment.get_metrics(): rep += 1 time.sleep(5) status = experiment.get_status() if status != prev_status: print("Status now: ", status) prev_status = status
def create_and_submit_experiment(azure_config: AzureConfig,
                                 script_run_config: ScriptRunConfig,
                                 commandline_args: str) -> Run:
    """
    Creates an AzureML experiment in the workspace and submits it for execution.
    :param azure_config: azure related configurations to set up a valid workspace.
    :param script_run_config: The configuration for the script that should be run inside of AzureML.
    :param commandline_args: A string with all commandline arguments that were provided to the runner. These are only
    used to set a tag on the submitted AzureML run.
    :returns: Run object for the submitted AzureML run
    """
    workspace = azure_config.get_workspace()
    experiment_name = create_experiment_name(azure_config)
    exp = Experiment(workspace=workspace, name=azure_util.to_azure_friendly_string(experiment_name))

    # Submit a training/testing run associated with the experiment.
    run: Run = exp.submit(script_run_config)
    if is_offline_run_context(run):
        # This codepath will only be executed in unit tests, when exp.submit is mocked.
        return run

    # Set metadata for the run.
    set_run_tags(run, azure_config, commandline_args=commandline_args)

    print("\n==============================================================================")
    print(f"Successfully queued new run {run.id} in experiment: {exp.name}")
    if azure_config.run_recovery_id:
        print(f"\nRecovered from: {azure_config.run_recovery_id}")
    recovery_id = azure_util.create_run_recovery_id(run)
    recovery_file = Path(RUN_RECOVERY_FILE)
    if recovery_file.exists():
        recovery_file.unlink()
    recovery_file.write_text(recovery_id)
    print(f"Experiment URL: {exp.get_portal_url()}")
    print(f"Run URL: {run.get_portal_url()}")
    print("If this run fails, re-start runner.py and supply these additional arguments: "
          f"--run_recovery_id={recovery_id}")
    print(f"The run recovery ID has been written to this file: {recovery_file}")
    print("==============================================================================")
    if azure_config.tensorboard and azure_config.azureml:
        print("Starting TensorBoard now because you specified --tensorboard")
        monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[run.id]), azure_config=azure_config)
    else:
        print("To monitor this run locally using TensorBoard, run the script: "
              f"InnerEye/Azure/tensorboard_monitor.py --run_ids={run.id}")
    print("==============================================================================")
    return run
# start the MLflow experiment
with mlflow.start_run():
    print("Starting experiment:", experiment.name)

    # Load data
    data = pd.read_csv('data/diabetes.csv')

    # Count the rows and log the result
    row_count = len(data)
    print('observations:', row_count)
    mlflow.log_metric('observations', row_count)

# Get a link to the experiment in Azure ML studio
experiment_url = experiment.get_portal_url()
print('See details at', experiment_url)

import os, shutil

# Create a folder for the experiment files
folder_name = 'mlflow-experiment-files'
experiment_folder = './' + folder_name
os.makedirs(folder_name, exist_ok=True)

# Copy the data file into the experiment folder
shutil.copy('data/diabetes.csv', os.path.join(folder_name, "diabetes.csv"))
# MAGIC We can:
# MAGIC 1. Use the Azure Portal to compare runs
# MAGIC 1. Use Python to compare runs

# COMMAND ----------

# MAGIC %md
# MAGIC #### 1. Azure Portal
# MAGIC The `Experiment` object has a `get_portal_url()` method that auto-populates the URL.
# MAGIC
# MAGIC We can use Databricks' `displayHTML` function to render a hyperlink.

# COMMAND ----------

# To find the best performing model we have several options: we can retrieve the metrics from within Python, or we can review the Azure portal.
displayHTML('<a href="{url}" target="_blank">{url}</a>'.format(url=experiment.get_portal_url()))

# COMMAND ----------

# MAGIC %md
# MAGIC #### 2. Python to Compare Runs
# MAGIC
# MAGIC Each `Run` object has a `get_metrics()` method that retrieves our stored metrics. We can leverage the `get_runs()` method of the `Experiment` object to retrieve the run objects, as sketched below.
# MAGIC
# MAGIC We will then render a table to compare model performance.

# COMMAND ----------

# Download RMSE and R2 from AML Service
import pandas as pd
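# The original cell is truncated after the pandas import. A minimal sketch of
# the comparison table it describes, built from experiment.get_runs() and
# run.get_metrics(); columns such as RMSE and R2 appear only if the runs
# actually logged metrics under those names:
runs = list(experiment.get_runs())
metrics = pd.DataFrame(
    [{"run_id": run.id, **run.get_metrics()} for run in runs]
)
display(metrics)  # Databricks table rendering: one row per run, one column per metric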