def submit_for_inference(args: SubmitForInferenceConfig, azure_config: AzureConfig) -> Optional[Path]: """ Create and submit an inference to AzureML, and optionally download the resulting segmentation. :param azure_config: An object with all necessary information for accessing Azure. :param args: configuration, see SubmitForInferenceConfig :return: path to downloaded segmentation on local disc, or None if none. """ logging.info(f"Building Azure configuration from {args.settings}") logging.info("Getting workspace") workspace = azure_config.get_workspace() logging.info("Identifying model") model = Model(workspace=workspace, id=args.model_id) model_id = model.id logging.info(f"Identified model {model_id}") source_directory = tempfile.TemporaryDirectory() source_directory_path = Path(source_directory.name) logging.info( f"Building inference run submission in {source_directory_path}") image_folder = source_directory_path / fixed_paths.DEFAULT_DATA_FOLDER image = copy_image_file(args.image_file, image_folder) model_sas_urls = model.get_sas_urls() # Identifies all the files with basename "environment.yml" in the model and downloads them. # These downloads should go into a temp folder that will most likely not be included in the model itself, # because the AzureML run will later download the model into the same folder structure, and the file names might # clash. temp_folder = source_directory_path / "temp_for_scoring" conda_files = download_files_from_model(model_sas_urls, ENVIRONMENT_YAML_FILE_NAME, dir_path=temp_folder) if not conda_files: raise ValueError( "At least 1 Conda environment definition must exist in the model.") # Copy the scoring script from the repository. This will start the model download from Azure, and invoke the # scoring script. entry_script = source_directory_path / Path( fixed_paths.RUN_SCORING_SCRIPT).name shutil.copyfile( str( fixed_paths.repository_root_directory( fixed_paths.RUN_SCORING_SCRIPT)), str(entry_script)) source_config = SourceConfig( root_folder=source_directory_path, entry_script=entry_script, script_params={ "--model-folder": ".", "--model-id": model_id, fixed_paths.SCORE_SCRIPT: "", # The data folder must be relative to the root folder of the AzureML job. test_image_files # is then just the file relative to the data_folder "--data_folder": image.parent.name, "--image_files": image.name }, conda_dependencies_files=conda_files, ) estimator = create_estimator_from_configs(azure_config, source_config, []) exp = Experiment(workspace=workspace, name=args.experiment_name) run = exp.submit(estimator) logging.info(f"Submitted run {run.id} in experiment {run.experiment.name}") logging.info(f"Run URL: {run.get_portal_url()}") if not args.keep_upload_folder: source_directory.cleanup() logging.info(f"Deleted submission directory {source_directory_path}") if args.download_folder is None: return None logging.info("Awaiting run completion") run.wait_for_completion() logging.info(f"Run has completed with status {run.get_status()}") download_path = choose_download_path(args.download_folder) logging.info(f"Attempting to download segmentation to {download_path}") run.download_file(DEFAULT_RESULT_IMAGE_NAME, str(download_path)) if download_path.exists(): logging.info(f"Downloaded segmentation to {download_path}") else: logging.warning("Segmentation NOT downloaded") return download_path
#Build Pipeline pipeline = Pipeline(workspace=ws, steps=[preprocessing_step, est_step, register_step]) #Validate pipeline pipeline.validate() print("Pipeline validation complete") #Publish the pipeline published_pipeline = pipeline.publish( name="MLOps_Pipeline_Estimator", description="MLOps pipeline for estimator", continue_on_step_failure=True) #submit Pipeline pipeline_run = exp.submit(pipeline, pipeline_parameters={}) print("Pipeline is submitted for execution") ####################################################################################################### # Shows output of the run on stdout. pipeline_run.wait_for_completion(show_output=True) # Raise exception if run fails if pipeline_run.get_status() == "Failed": raise Exception( "Training on local failed with following run status: {} and logs: \n {}" .format(pipeline_run.get_status(), pipeline_run.get_details_with_logs())) # Writing the run id to /aml_config/run_id.json '''
def launch_experiment(ws, conf_aml, conf_cluster, conf_docker, conf_experiment): # Register the input data blob container input_ds = Datastore.register_azure_blob_container( workspace=ws, datastore_name='petridishdata', container_name='datasets', account_name='petridishdata', account_key=conf_aml['azure_storage_account_key'], create_if_not_exists=False) output_ds = Datastore.register_azure_blob_container( workspace=ws, datastore_name='petridishoutput', container_name='amloutput', account_name='petridishdata', account_key=conf_aml['azure_storage_account_key'], create_if_not_exists=False) # Create or attach compute cluster # cluster_name = conf_cluster['cluster_name'] + datetime.datetime.now().strftime('%Y%m%d%I%M') cluster_name = conf_cluster['cluster_name'] try: compute_target = ComputeTarget(workspace=ws, name=cluster_name) print('Found existing compute target.') except: print('Creating a new compute target...') compute_config = AmlCompute.provisioning_configuration( vm_size=conf_cluster['vm_size'], max_nodes=conf_cluster['max_nodes'], vm_priority=conf_cluster['vm_priority'], idle_seconds_before_scaledown=conf_cluster[ 'idle_seconds_before_scaledown']) # Create the cluster compute_target = ComputeTarget.create(ws, cluster_name, compute_config) compute_target.wait_for_completion(show_output=True) # use get_status() to get a detailed status for the current cluster. print(compute_target.get_status().serialize()) # Set project directory # Assuming running in extract_features_from_videos folder project_folder = '../' # Setup custom docker usage image_registry_details = ContainerRegistry() image_registry_details.address = conf_docker['image_registry_address'] image_registry_details.username = conf_docker['image_registry_username'] image_registry_details.password = conf_docker['image_registry_password'] # don't let the system build a new conda environment user_managed_dependencies = True # Note that experiment names have to be # <36 alphanumeric characters exp_name = conf_experiment['experiment_name'] experiment = Experiment(ws, name=exp_name) # TODO: Make config for i in tqdm(range(200)): log_dir = exp_name + f'_{i}' script_params = { '--nas.eval.loader.dataset.dataroot': input_ds.path('/').as_mount(), '--common.logdir': output_ds.path('/{}'.format(log_dir)).as_mount(), } est = Estimator(source_directory=project_folder, script_params=script_params, compute_target=compute_target, entry_script='scripts/random/cifar_eval.py', custom_docker_image=conf_docker['image_name'], image_registry_details=image_registry_details, user_managed=user_managed_dependencies, source_directory_data_store=input_ds) run = experiment.submit(est)
from azureml.core import Workspace from azureml.core import Experiment from azureml.core import ScriptRunConfig import os, json # Workspaceの取得 ws = Workspace.from_config() # Experimentの設定 experiment_name = 'devops0201' exp = Experiment(workspace = ws, name = experiment_name) print(exp.name, exp.workspace.name, sep = '\n') # 実行構成 run_config_user_managed = RunConfiguration() run_config_user_managed.environment.python.user_managed_dependencies = True # モデル学習コードの指定 src = ScriptRunConfig(source_directory = './code', script = 'training/train.py', run_config = run_config_user_managed) print("モデル学習の実施") run = exp.submit(src) # 出力 run.wait_for_completion(show_output = True) run_id={} run_id['run_id'] = run.id run_id['experiment_name'] = run.experiment.name with open('aml_config/run_id.json', 'w') as outfile: json.dump(run_id,outfile)
dataset_version = 1 arguments = [ "--remote_debug", "--remote_debug_connection_timeout", 300, "--remote_debug_client_ip", ip, "--remote_debug_port", 5678, "--version", dataset_version, ] env = Environment.get(workspace=ws, name="ds_envs") src = ScriptRunConfig( source_directory=get_project_root() / "ds_envs" / "cloud", script="train.py", arguments=arguments, compute_target="local", environment=env, ) experiment_name = "my_experiment" experiment = Experiment(workspace=ws, name=experiment_name) run = experiment.submit(config=src) run.wait_for_completion(show_output=True)
source_directory=project_folder, allow_reuse=True, runconfig=amlcompute_run_config) print("AutoML Training Step created.") steps = [anom_detect, automl_step] print("Step lists created") pipeline = Pipeline(workspace=ws, steps=steps) print("Pipeline is built") pipeline.validate() print("Pipeline validation complete") pipeline_run = experiment.submit(pipeline) #, regenerate_outputs=True) print("Pipeline is submitted for execution") # Wait until the run finishes. pipeline_run.wait_for_completion(show_output=False) print("Pipeline run completed") # Download aml_config info and output of automl_step def_data_store.download(target_path='.', prefix='aml_config', show_progress=True, overwrite=True) def_data_store.download(target_path='.', prefix='outputs', show_progress=True,
async def __create_cluster(self): # set up environment self.__print_message("Setting up cluster") # submit run self.__print_message("Submitting the experiment") exp = Experiment(self.workspace, self.experiment_name) estimator = Estimator( os.path.join(self.abs_path, "setup"), compute_target=self.compute_target, entry_script="start_scheduler.py", environment_definition=self.environment_definition, script_params=self.scheduler_params, node_count=1, ### start only scheduler distributed_training=MpiConfiguration(), use_docker=True, inputs=self.datastores, ) run = exp.submit(estimator) self.__print_message("Waiting for scheduler node's IP") while ( run.get_status() != "Canceled" and run.get_status() != "Failed" and "scheduler" not in run.get_metrics() ): print(".", end="") logger.info("Scheduler not ready") time.sleep(5) if run.get_status() == "Canceled" or run.get_status() == "Failed": logger.exception("Failed to start the AzureML cluster") raise Exception("Failed to start the AzureML cluster.") print("\n\n") ### SET FLAGS self.scheduler_ip_port = run.get_metrics()["scheduler"] self.worker_params["--scheduler_ip_port"] = self.scheduler_ip_port self.__print_message(f'Scheduler: {run.get_metrics()["scheduler"]}') self.run = run logger.info(f'Scheduler: {run.get_metrics()["scheduler"]}') ### CHECK IF ON THE SAME VNET while self.same_vnet is None: await self.sync(self.__check_if_scheduler_ip_reachable) time.sleep(1) ### REQUIRED BY dask.distributed.deploy.cluster.Cluster _scheduler = self.__prepare_rpc_connection_to_headnode() self.scheduler_comm = rpc(_scheduler) await self.sync(self.__setup_port_forwarding) await self.sync(super()._start) await self.sync(self.__update_links) self.__print_message("Connections established") self.__print_message(f"Scaling to {self.initial_node_count} workers") if self.initial_node_count > 1: self.scale( self.initial_node_count ) # LOGIC TO KEEP PROPER TRACK OF WORKERS IN `scale` self.__print_message(f"Scaling is done")
'--lr_decay': loguniform(-9, -1) }) policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1) #, delay_evaluation=20) hdc = HyperDriveRunConfig(estimator=est, hyperparameter_sampling=ps, policy=policy, primary_metric_name='val_loss', primary_metric_goal=PrimaryMetricGoal.MINIMIZE, max_total_runs=5, max_concurrent_runs=50) hdr = exp.submit(config=hdc) hdr.wait_for_completion(show_output=True) best_run = hdr.get_best_run_by_primary_metric() best_run_metrics = best_run.get_metrics() print(best_run) # Writing the run id to /aml_config/run_id.json for use by a DevOps pipeline. run_id = {} run_id['run_id'] = best_run.id run_id['experiment_name'] = best_run.experiment.name # save run info os.makedirs('aml_config', exist_ok=True) with open('aml_config/run_id.json', 'w') as outfile:
# %% from azureml.train.dnn import PyTorch estimator = PyTorch(source_directory=project_folder, script_params={'--output-dir': './outputs'}, compute_target=compute_target, entry_script='mnist.py', use_gpu=False) estimator.conda_dependencies.remove_conda_package('pytorch=0.4.0') estimator.conda_dependencies.add_conda_package('pytorch-nightly') estimator.conda_dependencies.add_channel('pytorch') # %% run = exp.submit(estimator) run.wait_for_completion(show_output=True) # %% run.get_file_names() model_path = os.path.join('outputs', 'mnist.onnx') run.download_file(model_path, output_file_path=model_path) # %% model = run.register_model(model_name='mnist', model_path=model_path) print(model.name, model.id, model.version, sep='\t') # %% models = ws.models for name, m in models.items(): print("Name:", name, "\tVersion:", m.version, "\tDescription:",
def main(): """ Run the experiment for training """ work_space = Workspace.from_config() # Set up the dataset for training datastore = work_space.get_default_datastore() dataset = Dataset.File.from_files(path=(datastore, "datasets/mnist")) # Set up the experiment for training experiment = Experiment(workspace=work_space, name="keras-lenet-train") # azureml._restclient.snapshots_client.SNAPSHOT_MAX_SIZE_BYTES = 2000000000 config = ScriptRunConfig( source_directory=".", script="train_keras.py", compute_target="cpu-cluster", arguments=[ "--data_folder", dataset.as_named_input("input").as_mount(), ], ) # Set up the Tensoflow/Keras environment environment = Environment("keras-environment") environment.python.conda_dependencies = CondaDependencies.create( python_version="3.7.7", pip_packages=["azureml-defaults", "numpy", "tensorflow==2.3.1"]) config.run_config.environment = environment # Run the experiment for training run = experiment.submit(config) aml_url = run.get_portal_url() print( "Submitted to an Azure Machine Learning compute cluster. Click on the link below" ) print("") print(aml_url) tboard = Tensorboard([run]) # If successful, start() returns a string with the URI of the instance. tboard.start(start_browser=True) run.wait_for_completion(show_output=True) # After your job completes, be sure to stop() the streaming otherwise it will continue to run. print("Press enter to stop") input() tboard.stop() # Register Model metrics = run.get_metrics() run.register_model( model_name="keras_mnist", tags={ "data": "mnist", "model": "classification" }, model_path="outputs/keras_lenet.h5", model_framework=Model.Framework.TENSORFLOW, model_framework_version="2.3.1", properties={ "train_loss": metrics["train_loss"][-1], "train_accuracy": metrics["train_accuracy"][-1], "val_loss": metrics["val_loss"][-1], "val_accuracy": metrics["val_accuracy"][-1], }, )
runConfig.environment.python.interpreter_path = os.environ[ 'VIRTUAL_ENV'] + "/bin/python" print( f"### Will execute script {trainingScriptDir}/{trainingScript} on LOCAL compute" ) # Pass two args to the training script scriptArgs = [ "--data-path", "/tmp/" + dataPathRemote, "--estimators", estimators ] scriptRunConf = ScriptRunConfig(source_directory=trainingScriptDir, script=trainingScript, arguments=scriptArgs, run_config=runConfig) run = exp.submit(scriptRunConf) print(f"### Run '{run.id}' submitted and started...") run.wait_for_completion(show_output=True, wait_post_processing=True) # ===== Training Complete ===== if run.status == "Failed": print(f'### ERROR! Run did not complete. Training failed!') exit(1) accuracy = run.get_metrics()['accuracy'] or 0.0 model = run.register_model( # NOTE! Must be called 'outputs' this is expected by training scripts and fetch model process model_path='outputs/', model_name=os.environ['AZML_MODEL'],
from azureml.core import Workspace, Experiment, ScriptRunConfig ws = Workspace.from_config() compute_target = ws.compute_targets['V100-4'] # compute_target = ws.compute_targets['K80'] command = [ "pip install torch transformers datasets flaml[blendsearch,ray] && ", "python test_electra.py" ] config = ScriptRunConfig( source_directory='hf/', command=command, compute_target=compute_target, ) exp = Experiment(ws, 'test-electra') run = exp.submit(config) print(run.get_portal_url()) # link to ml.azure.com run.wait_for_completion(show_output=True)
def main(): parser = argparse.ArgumentParser( description="Run Elbencho on a BeeOND enabled cluster" ) parser.add_argument("num_nodes", type=int, help="Number of nodes") parser.add_argument("--follow", action="store_true", help="Follow run output") parser.add_argument( "--keep-cluster", action="store_true", help="Don't autoscale cluster down when idle (after run completed)", ) parser.add_argument( "--keep-failed-cluster", dest="terminate_on_failure", action="store_false" ) parser.add_argument("--sharedfiles", action="store_false", dest="multifile") args = parser.parse_args() workspace = get_or_create_workspace( sharedconfig.subscription_id, sharedconfig.resource_group_name, sharedconfig.workspace_name, sharedconfig.location, ) try: clusterconnector = create_or_update_cluster( workspace, sharedconfig.cluster_name, args.num_nodes, sharedconfig.ssh_key, sharedconfig.vm_type, terminate_on_failure=args.terminate_on_failure, use_beeond=True, ) except RuntimeError: cprint("Fatal Error - exiting", "red", attrs=["bold"]) sys.exit(-1) docker_args = [ "-v", "{}:{}".format(clusterconnector.beeond_mnt, sharedconfig.beeond_map), ] # Get and update the AzureML Environment object environment = create_or_update_environment( workspace, sharedconfig.environment_name, sharedconfig.docker_image, docker_args ) # Get/Create an experiment object experiment = Experiment(workspace=workspace, name=sharedconfig.experiment_name) # Configure the distributed compute settings parallelconfig = MpiConfiguration( node_count=args.num_nodes, process_count_per_node=1 ) if args.multifile: runscript = "./run_elbencho_multifile.sh" else: runscript = "./run_elbencho_largefile.sh" # Collect arguments to be passed to elbencho script script_args = [ "bash", runscript, sharedconfig.beeond_map, str(args.num_nodes), *clusterconnector.ibaddrs, ] # Define the configuration for running the training script script_conf = ScriptRunConfig( source_directory="scripts", command=script_args, compute_target=clusterconnector.cluster, environment=environment, distributed_job_config=parallelconfig, ) # We can use these tags make a note of run parameters (avoids grepping the logs) runtags = { "class": k_runclass, "vmtype": sharedconfig.vm_type, "num_nodes": args.num_nodes, "run_type": "multifile" if args.multifile else "sharedfile", } # Submit the run run = experiment.submit(config=script_conf, tags=runtags) # Can optionally choose to follow the output on the command line if args.follow: run.wait_for_completion(show_output=True)
class AzureMLTrainer(trainer.Trainer): is_connected: bool = False __config_file: str = '.azureml/config.json' __workspace: Workspace = None __experiment: Experiment = None __current_experiment_name: str __current_run: Run = None __logger: Logger = None __vm_size_list: list = None def __init__(self, experiment_name: str, aml_workspace: Workspace, aml_run: Run = None): ''' Initializes a new connected Trainer that will persist and log all runs on AzureML workspace Args: experiment_name (str): The name of the experiment that will be seen on AzureML aml_workspace (Workspace): The connected workspace on AzureML ''' self.__workspace = aml_workspace self.__logger = logging.getLogger() if aml_run is not None: self.__current_run = aml_run self.__experiment = aml_run.experiment self.__current_experiment_name = aml_run.experiment.name else: self.__current_experiment_name = experiment_name self.__experiment = Experiment(workspace=self.__workspace, name=experiment_name) @classmethod def CreateFromContext(cls): ''' Creates a Trainer, based on the current Run context. This will only work when used in an Estimator Returns: AzureMLTrainer: an instance of AzureMLTrainer allowing the user to work connected. ''' run = Run.get_context() return cls(run.experiment.name, run.experiment.workspace, run) def new_run(self, description: str = None, copy_folder: bool = True, metrics: dict = None) -> Run: ''' This will begin a new interactive run on the existing AzureML Experiment. When a previous run was still active, it will be completed. Args: description (str): An optional description that will be added to the run metadata copy_folder (bool): Indicates if the output folder should be snapshotted and persisted metrics (dict): The metrics that should be logged in the run already Returns: Run: the AzureML Run object that can be used for further access and custom logic ''' if(self.__current_run is not None): self.__current_run.complete() if(copy_folder): self.__current_run = self.__experiment.start_logging() else: self.__current_run = self.__experiment.start_logging(snapshot_directory = None) if(metrics is not None): for k, v in metrics.items(): self.__current_run.log(k, v) if(description is not None): self.__current_run.log('Description', description) return self.__current_run def add_tuning_result(self, run_index: int, train_score: float, test_score: float, sample_count: int, durations:np.array, parameters: dict, estimator): ''' This add results of a cross validation fold to the child run in a Grid Search Args: train_score (float): The given score of the training data test_score (float): The given score of the test data sample_count (int): The number of samples that were part of a fold durations (np.array): The different durations of the Grid Search parameters (dict): The parameter combinations that have been tested in this cross validation fold estimate (model): The actual fitted estimator / model that was trained in this fold ''' _child_run = self.__current_run.child_run('Gridsearch' + str(run_index)) self.__current_run.log_row('Trainscore', score = train_score) self.__current_run.log_row('Testscore', score = test_score) _table = { 'Testing score': test_score, 'Training score': train_score } for k in parameters.keys(): v = parameters[k] if(v is None): v = 'None' _child_run.log(k, v) _table[k] = v self.__current_run.log_row('Results', '', **_table) _child_run.complete() def get_best_model(self, metric_name:str, take_highest:bool = True): ''' Tags and returns the best model of the experiment, based on the given metric 
Args: metric_name (str): The name of the metric, such as accuracy take_highest (bool): In case of accuracy and score, this is typically True. In case you want to get the model based on the lowest error, you can use False Returns: Run: the best run, which will be labeled as best run ''' runs = {} run_metrics = {} for r in tqdm(self.__experiment.get_runs()): metrics = r.get_metrics() if metric_name in metrics.keys(): runs[r.id] = r run_metrics[r.id] = metrics best_run_id = min(run_metrics, key = lambda k: run_metrics[k][metric_name]) best_run = runs[best_run_id] best_run.tag('Best run') return best_run def get_azureml_experiment(self): ''' Gives access to the AzureML experiment object Returns: Experiment: the existing experiment ''' return self.__experiment def complete_run(self, fitted_model, metrics_to_log: dict = None, upload_model: bool = True): ''' Saves all results of the active Run and completes it Args: fitted_model (model): The already fitted model to be tested. Sklearn and Keras models have been tested metrics_to_log (dict): The metrics that should be logged with the model to the run upload_model (bool): This will upload the model (pkl file or json) to AzureML run (defaults to True) ''' is_keras = 'keras' in str(type(fitted_model)) if(metrics_to_log is not None): for k, v in metrics_to_log.items(): self._log_metrics(k, v) if upload_model: # Save the model to the outputs directory for capture if(is_keras): model_folder_name = 'outputs/model' fitted_model.save(model_folder_name) files_to_upload = dict() else: model_file_name = 'outputs/model.pkl' joblib.dump(value = fitted_model, filename = model_file_name) self._complete_run() def evaluate_classifier(self, fitted_model, X_test: np.array, y_test: np.array, show_roc: bool = False, save_curves_as_image: bool = False, class_names: np.array = None, finish_existing_run: bool = True, upload_model: bool = True, return_predictions: bool = False) -> np.array: ''' Will predict and evaluate a model against a test set and save all results to the active Run on AzureML Args: fitted_model (model): The already fitted model to be tested. Sklearn and Keras models have been tested X_test (np.array): The test set to calculate the predictions with y_test (np.array): The output test set to evaluate the predictions against show_roc (bool): This will upload the ROC curve to the run in case of a binary classifier save_curves_as_image (bool): This will save the training & loss curves as images class_names (np.array): The class names that will be linked to the Confusion Matrix. 
If not provided, the unique values of the y_test matrix will be used finish_existing_run (bool): Will complete the existing run on AzureML (defaults to True) upload_model (bool): This will upload the model (pkl file) to AzureML run (defaults to True) return_predictions (bool): If true, the y_pred values will be returned Returns: np.array: The predicted (y_pred) values against the model ''' is_keras = 'keras' in str(type(fitted_model)) # Predict X_test with model if(is_keras): if 'predict_classes' in dir(fitted_model): y_pred = fitted_model.predict_classes(X_test) else: y_pred = fitted_model.predict(X_test) y_pred = np.argmax(y_pred, axis=1) self.add_training_plots(fitted_model, save_image=save_curves_as_image) else: y_pred = fitted_model.predict(X_test) if class_names is None: class_names = np.char.mod('%d', sorted(np.unique(y_test))) # Print classification report print(metrics.classification_report(y_test, y_pred)) # Confusion matrix cf = metrics.confusion_matrix(y_test, y_pred) self._log_confmatrix(cf, class_names) # Accuracy accuracy = metrics.accuracy_score(y_test, y_pred) * 100 self._log_metrics('accuracy', accuracy, description='') if(show_roc == True): # Verify that we are having a binary classifier if(len(class_names)!=2): raise AttributeError('Showing a ROC curve is only possible for binary classifier, not for multi class') self.__log_roc_curve(y_test, y_pred) if (finish_existing_run): self.complete_run(fitted_model, upload_model = upload_model) if return_predictions: return y_pred def add_training_plots(self, fitted_model, metrics=None, save_image: bool = False): ''' Add the training plots to the Run history Args: fitted_model (Keras model): the fitted model that contains the training history metrics (list): the metrics that should be tracked to the run. If None, all available metrics will be taken ''' history = fitted_model.history if metrics is None: metrics = history.history.keys() for metric in metrics: if(metric in history.history.keys()): self.__current_run.log_table(f'Plot {metric}', {metric: history.history[metric]}) if(save_image and not metric.startswith('val_') and metric in history.history.keys()): plt.plot(history.history[metric]) plt.plot(history.history[f'val_{metric}']) plt.title(f'model {metric}') plt.ylabel(metric) plt.xlabel('epoch') plt.legend(['train', 'test'], loc='upper left') #plt.show() self.__current_run.log_image(f'model {metric}', plot=plt) plt.close() def evaluate_image_classifier(self, fitted_model, X_test: np.array, y_test: np.array, show_roc: bool = False, failed_classifications_to_save: int = 0, image_shape = None, save_curves_as_image: bool = False, class_names: np.array = None, finish_existing_run: bool = True, upload_model: bool = True, return_predictions: bool = False) -> np.array: ''' Will predict and evaluate a model against a test set and save all results to the active Run on AzureML Args: fitted_model (model): The already fitted model to be tested. Sklearn and Keras models have been tested X_test (np.array): The test set to calculate the predictions with y_test (np.array): The output test set to evaluate the predictions against show_roc (bool): This will upload the ROC curve to the run in case of a binary classifier failed_classifications_to_save (int): If greather than 0, this amount of incorrectly classified images will be tracked to the Run image_shape ((int, int, int)): Indicates if images should be reshaped before saving them class_names (np.array): The class names that will be used in the description. 
If not provided, the unique values of the y_test matrix will be used finish_existing_run (bool): Will complete the existing run on AzureML (defaults to True) upload_model (bool): This will upload the model (pkl file) to AzureML run (defaults to True) Returns: np.array: The predicted (y_pred) values against the model ''' from arcus.ml.images import explorer y_pred = self.evaluate_classifier(fitted_model, X_test, y_test, show_roc=show_roc, save_curves_as_image=save_curves_as_image, class_names= class_names, finish_existing_run=False, upload_model=upload_model, return_predictions=True) if failed_classifications_to_save > 0: # Take incorrect classified images and save import random incorrect_predictions = [i for i, item in enumerate(y_pred) if item != y_test[i]] total_images = min(len(incorrect_predictions), failed_classifications_to_save) for i in random.sample(incorrect_predictions, total_images): pred_class = y_pred[i] act_class = y_test[i] if class_names is not None: pred_class = class_names[pred_class] act_class = class_names[act_class] if image_shape is not None: # Reshape image before saving it imgplot = explorer.show_image(X_test[i].reshape(image_shape), silent_mode=True) else: imgplot = explorer.show_image(X_test[i], silent_mode=True) description = f'Predicted {pred_class} - Actual {act_class}' self.__current_run.log_image(description, plot=imgplot) if return_predictions: return y_pred def __stack_images(self, img1: np.array, img2: np.array): ha,wa = img1.shape[:2] hb,wb = img2.shape[:2] max_width = np.max([wa, wb]) total_height = ha+hb new_img = np.zeros(shape=(total_height, max_width, 3)) new_img[:ha,:wa]=img1 new_img[ha:hb+ha,:wb]=img2 return new_img def __concat_images(self, image_list: np.array) -> np.array: output = None for i, img in enumerate(image_list): if i==0: output = img else: output = self.__stack_images(output, img) return output def save_image_outputs(self, X_test: np.array, y_test: np.array, y_pred: np.array, samples_to_save: int = 1) -> np.array: ''' Will save image outputs to the run Args: X_test (np.array): The input images for the model y_test (np.array): The actual expected output images of the model y_pred (np.array): The predicted or calculated output images of the model samples_to_save (int): If greather than 0, this amount of input, output and generated image combinations will be tracked to the Run ''' from arcus.ml.images import explorer if samples_to_save > 0: import random total_images = min(len(y_pred), samples_to_save) for i in random.sample(range(len(y_pred)), total_images): newimg = self.__concat_images([X_test[i], y_test[i], y_pred[i]]) imgplot = explorer.show_image(newimg, silent_mode=True) self.__current_run.log_image(f'Image combo sample {i}', plot=imgplot) imgplot.close() def setup_training(self, training_name: str, overwrite: bool = False): ''' Will initialize a new directory (using the given training_name) and add a training script and requirements file to run training Args: training_name (str): The name of a training. This will be used to create a directory. 
Can contain subdirectory overwrite (bool): Defines if the existing training files should be overwritten ''' if not os.path.exists(training_name): os.makedirs(training_name) # Take default training script and copy to the new folder default_training_script_file = os.path.join(str(os.path.dirname(__file__)), 'resources/train.py') default_requirements_file = os.path.join(str(os.path.dirname(__file__)), 'resources/requirements.txt') dest_training_script_file = os.path.join(training_name, 'train.py') dest_requirements_file = os.path.join(training_name, 'requirements.txt') if overwrite or not(os.path.isfile(dest_training_script_file)): shutil.copy2(default_training_script_file, training_name) if overwrite or not(os.path.isfile(dest_requirements_file)): shutil.copy2(default_requirements_file, training_name) def start_training(self, training_name: str, environment_type: str = None, input_datasets: np.array = None, input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, script_parameters: dict = None, show_widget: bool = True, use_estimator: bool = False, **kwargs): ''' Will start a new training, taking the training name as the folder of the run Args: training_name (str): The name of a training. This will be used to create a directory. Can contain subdirectory environment_type (str): either the name of an existing environment that will be taken as base, or one of these values (tensorflow, sklearn, pytorch). input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name compute_target (str): The compute target (default = 'local') on which the training should be executed gpu_compute (bool): Indicates if GPU compute is required for this script or not script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script show_widget (bool): Will display the live tracking of the submitted Run Returns: Run : the submitted run ''' if use_estimator: print('Scheduling Estimator training') self._start_estimator_training(training_name, environment_type, input_datasets, input_datasets_to_download, compute_target, gpu_compute, script_parameters, show_widget, **kwargs) else: print('Scheduling ScriptRunConfig training') self._start_environment_training(training_name, environment_type, input_datasets, input_datasets_to_download, compute_target, gpu_compute, script_parameters, show_widget, **kwargs) if script_parameters is not None: for arg in script_parameters.keys(): self.__current_run.log(arg.replace('--', ''), script_parameters[arg]) print(self.__current_run.get_portal_url()) if(show_widget): from azureml.widgets import RunDetails RunDetails(self.__current_run).show() return self.__current_run def _start_environment_training(self, training_name: str, environment_type: str = None, input_datasets: np.array = None, input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, script_parameters: dict = None, show_widget: bool = True, **kwargs): ''' Will start a new training using ScriptRunConfig, taking the training name as the folder of the run Args: training_name (str): The name of a training. This will be used to create a directory. 
Can contain subdirectory environment_type (str): either the name of an existing environment that will be taken as base, or one of these values (tensorflow, sklearn, pytorch). input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name compute_target (str): The compute target (default = 'local') on which the training should be executed gpu_compute (bool): Indicates if GPU compute is required for this script or not script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script show_widget (bool): Will display the live tracking of the submitted Run ''' from azureml.train.estimator import Estimator from azureml.core import Environment, ScriptRunConfig from azureml.core.runconfig import RunConfiguration from azureml.core.runconfig import DataReferenceConfiguration from azureml.core.runconfig import CondaDependencies from arcus.azureml.experimenting import train_environment as te # Check if directory exists if not(os.path.exists(training_name) and os.path.isdir(training_name)): raise FileNotFoundError(training_name) # Check compute target if compute_target != 'local': self.__check_compute_target(compute_target, gpu_compute) training_env = te.get_training_environment(self.__workspace, training_name, os.path.join(training_name, 'requirements.txt'), use_gpu=gpu_compute, include_prerelease=True, environment_type=environment_type) runconfig = RunConfiguration() # Add datasets datarefs = dict() scriptargs = list() if script_parameters is not None: for key in script_parameters.keys(): scriptargs.append(key) scriptargs.append(script_parameters[key]) if(input_datasets is not None): for ds in input_datasets: print(f'Adding mounting data reference for dataset {ds}') # scriptargs.append(ds) scriptargs.append(self.__workspace.datasets[ds].as_named_input(ds).as_mount(path_on_compute = ds)) # datastore, path = self._get_data_reference(self.__workspace.datasets[ds]) # datarefs[ds] = DataReferenceConfiguration(datastore_name=datastore, path_on_datastore = path, path_on_compute = '/' + ds, mode = 'mount', overwrite = False) if(input_datasets_to_download is not None): for ds in input_datasets_to_download: print(f'Adding download data reference for dataset {ds}') # scriptargs.append(ds) scriptargs.append(self.__workspace.datasets[ds].as_named_input(ds).as_download(path_on_compute = ds)) scriptrunconfig = ScriptRunConfig(source_directory='./' + training_name, script="train.py", run_config=runconfig, arguments=scriptargs) scriptrunconfig.run_config.target = compute_target scriptrunconfig.run_config.environment = training_env #scriptrunconfig.run_config.data_references = datarefs # Submit training self.__current_run = self.__experiment.submit(scriptrunconfig) def _get_data_reference(self, dataset: Dataset): import json j = json.loads(str(dataset).replace('FileDataset\n', '')) source = j['source'][0] sections = source.split("'") return sections[1], sections[3] def _start_estimator_training(self, training_name: str, estimator_type: str = None, input_datasets: np.array = None, input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, script_parameters: dict = None, show_widget: bool = True, **kwargs): ''' Will start a new training using an Estimator, taking the training name as the folder of the run Args: training_name 
(str): The name of a training. This will be used to create a directory. Can contain subdirectory environment_type (str): one of these values (tensorflow, sklearn, pytorch). input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name compute_target (str): The compute target (default = 'local') on which the training should be executed gpu_compute (bool): Indicates if GPU compute is required for this script or not script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script show_widget (bool): Will display the live tracking of the submitted Run ''' from azureml.train.estimator import Estimator # Check if directory exists if not(os.path.exists(training_name) and os.path.isdir(training_name)): raise FileNotFoundError(training_name) # Check compute target if compute_target != 'local': self.__check_compute_target(compute_target, gpu_compute) # Add datasets datasets = list() if(input_datasets is not None): for ds in input_datasets: datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_mount(path_on_compute=ds)) if(input_datasets_to_download is not None): for ds in input_datasets_to_download: datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_download(path_on_compute=ds)) # as mount - as download constructor_parameters = { 'source_directory':training_name, 'script_params':script_parameters, 'inputs':datasets, 'compute_target':compute_target, 'entry_script':'train.py', 'pip_requirements_file':'requirements.txt', 'use_gpu':gpu_compute, 'use_docker':True} print('Creating estimator of type', estimator_type) if(estimator_type is None): # Using default Estimator estimator = Estimator(**constructor_parameters) elif(estimator_type == 'tensorflow'): from azureml.train.dnn import TensorFlow version_par = 'framework_version' if(not version_par in constructor_parameters.keys()): print('Defaulting to version 2.0 for TensorFlow') constructor_parameters[version_par] = '2.0' estimator = TensorFlow(**constructor_parameters) elif(estimator_type == 'sklearn'): from azureml.train.sklearn import SKLearn estimator = SKLearn(**constructor_parameters) elif(estimator_type == 'pytorch'): from azureml.train.dnn import PyTorch estimator = PyTorch(**constructor_parameters) # Submit training self.__current_run = self.__experiment.submit(estimator) # protected implementation methods def _log_metrics(self, metric_name: str, metric_value: float, description:str = None): print(metric_name, metric_value) self.__current_run.log(metric_name, metric_value, description=description) def _complete_run(self): ''' Completes the current run ''' self.__current_run.complete() def _log_confmatrix(self, confusion_matrix: np.array, class_names: np.array): data = {} data['schema_type'] = 'confusion_matrix' data['schema_version'] = 'v1' data['data'] = {} data['data']['class_labels'] = class_names.tolist() data['data']['matrix'] = confusion_matrix.tolist() print(confusion_matrix) json_data = json.dumps(data) self.__current_run.log_confusion_matrix('Confusion matrix', json_data, description='') def _save_roc_curve(self, roc_auc: float, roc_plot: plt): self._log_metrics('roc_auc', roc_auc) self.__current_run.log_image('ROC Curve', plot=plt) def __check_compute_target(self, compute_target, use_gpu: bool): __vm_size = '' if isinstance(compute_target, AmlCompute): 
__vm_size = compute_target.vm_size elif isinstance(compute_target, str): compute = ComputeTarget(workspace=self.__workspace, name=compute_target) __vm_size = compute.vm_size if self.__vm_size_list is None: self.__vm_size_list = AmlCompute.supported_vmsizes(self.__workspace) vm_description = list(filter(lambda vmsize: str.upper(vmsize['name']) == str.upper(__vm_size), self.__vm_size_list))[0] if(use_gpu and vm_description['gpus'] == 0): raise errors.TrainingComputeException(f'gpu_compute was specified, but the target does not have GPUs: {vm_description} ') if(not (use_gpu) and vm_description['vCPUs'] == 0): raise errors.TrainingComputeException(f'cpu_compute was specified, but the target does not have CPUs: {vm_description} ') def __log_roc_curve(self, y_pred: np.array, y_test: np.array): '''Will upload the Receiver Operating Characteristic (ROC) Curve for binary classifiers Args: y_pred (np.array): The predicted values of the test set y_test (np.array): The actual outputs of the test set Returns: float: The ROC_AUC value ''' # calculate the fpr and tpr for all thresholds of the classification fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred) roc_auc = metrics.auc(fpr, tpr) plt.cla() plt.title('Receiver Operating Characteristic') plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc) plt.legend(loc = 'lower right') plt.plot([0, 1], [0, 1],'r--') plt.xlim([0, 1]) plt.ylim([0, 1]) plt.ylabel('True Positive Rate') plt.xlabel('False Positive Rate') self._save_roc_curve(roc_auc, plt) plt.show(block=False) plt.close() return roc_auc
def create_and_submit_experiment(workspace: Workspace, azure_config: AzureConfig, source_config: SourceConfig, model_config_overrides: str, azure_dataset_id: str) -> Run: """ Creates an AzureML experiment in the provided workspace and submits it for execution. :param workspace: configured workspace to use to run the experiment in :param azure_config: azure related configurations to setup valid workspace :param source_config: The information about which code should be submitted, and which arguments should be used. :param model_config_overrides: A string that describes which model parameters were overwritten by commandline arguments in the present run. This is only used for diagnostic purposes (it is set as a Tag on the run). :param azure_dataset_id: The name of the dataset in blob storage to be used for this run. :returns: Run object for the submitted AzureML run """ experiment_name = create_experiment_name(azure_config) exp = Experiment(workspace=workspace, name=azure_util.to_azure_friendly_string(experiment_name)) pt_env = create_pytorch_environment(workspace, azure_config, source_config, azure_dataset_id) # submit a training/testing run associated with the experiment run: Run = exp.submit(pt_env) # set metadata for the run set_run_tags(run, azure_config, model_config_overrides) print("\nSuccessfully queued new run for experiment: {}".format(exp.name)) print( "==============================================================================" ) if azure_config.run_recovery_id: print(f"\nRecovered from: {azure_config.run_recovery_id}") recovery_id = azure_util.create_run_recovery_id(run) recovery_file = Path(RUN_RECOVERY_FILE) if recovery_file.exists(): recovery_file.unlink() recovery_file.write_text(recovery_id) print( "==============================================================================" ) print("Experiment URL: {}".format(exp.get_portal_url())) print("Run URL: {}".format(run.get_portal_url())) print( "If this run fails, re-start runner.py and supply these additional arguments: " f"--run_recovery_id={recovery_id}") print( f"The run recovery ID has been written to this file: {recovery_file}") print( "==============================================================================" ) if azure_config.tensorboard and azure_config.azureml: print("Starting TensorBoard now because you specified --tensorboard") monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[run.id]), azure_config=azure_config) else: print( f"To monitor this run locally using TensorBoard, run the script: " f"InnerEye/Azure/tensorboard_monitor.py --run_ids={run.id}") print( "==============================================================================" ) return run
# Configuring a PythonScriptStep with a RunConfiguration # that includes debugpy and azure-debug-relay run_config = RunConfiguration() conda_dependencies = run_config.environment.python.conda_dependencies conda_dependencies.add_conda_package("pip") conda_dependencies.add_conda_package("scikit-learn") conda_dependencies.add_pip_package("azureml-sdk==" + amlcore.__version__) conda_dependencies.add_pip_package("azureml-defaults") train_step = PythonScriptStep(name='Train Step with Debugging', script_name="diabetes_train_2.py", source_directory="./scripts", compute_target=compute_target, runconfig=run_config, allow_reuse=False) print('About to submit') # Submitting an Azure ML Pipeline Run step_sequence = StepSequence(steps=[train_step]) pipeline = Pipeline(workspace, steps=step_sequence) experiment = Experiment(workspace=workspace, name=experiment_name) run = experiment.submit(pipeline) print('submitted') # Show the running experiment run in the notebook widget #RunDetails(run).show() # Block until the experiment run has completed run.wait_for_completion()
est = ScriptRunConfig( source_directory=os.path.dirname(os.path.realpath(__file__)), arguments=[ "--models", models, '--data_folder_train', 'DatasetConsumptionConfig:{}'.format(input_name_train), '--data_folder_test', 'DatasetConsumptionConfig:{}'.format(input_name_test), '--local', 'no' ], run_config=run_config) # Define the ML experiment experiment = Experiment(workspace, "explore_" + models) # Submit experiment run, if compute is idle, this may take some time') run = experiment.submit(est) if models == 'deeplearning': dataset_train = Dataset.get_by_name(workspace, name=input_name_train) dataset_test = Dataset.get_by_name(workspace, name=input_name_test) # define script parameters script_params_3 = { '--models': models, '--data_folder_train': dataset_train.as_named_input('train').as_mount(), '--data_folder_test': dataset_test.as_named_input('test').as_mount(), '--local': 'no' } estimator = PyTorch(
def my_azure_app(cfg: DictConfig) -> None: print(cfg.pretty()) args_dict = OmegaConf.to_container(cfg, resolve=False) yaml_file_nm = args_dict["yaml_file"].split("/")[-1].split(".")[0] conf_file = os.path.join( args_dict["root_path"], yaml_file_nm + "_" + str(datetime.datetime.now()) + ".json", ) print(conf_file) with open(conf_file, "w") as out: out.write(json.dumps(args_dict)) # First, list the supported VM families for Azure Machine Learning Compute # ws = Workspace.get('experiments') cluster_name = "gpucluster" experiment_name = args_dict["experiment_name"] + "_azure" disable_gpu = args_dict["disable_gpu"] script_folder = "." # todo. this is overriden by hydra script_folder = (hydra.utils.get_original_cwd() ) # todo. this is overriden by hydra data_path = os.path.join(args_dict["root_path"], args_dict["data_subdir"]) sub_id = os.getenv("AZ_SUBS_ID") assert sub_id is not None # Edit a run configuration property on the fly. run_local = RunConfiguration() run_local.environment.python.user_managed_dependencies = True ws = Workspace.get( name="experiments", subscription_id=sub_id, resource_group="default_resource_group", ) # print(AmlCompute.supported_vmsizes(workspace=ws)) # Create a new runconfig object _ = RunConfiguration() # Signal that you want to use AmlCompute to execute the script # run_temp_compute.target = "amlcompute" # AmlCompute is created in the same region as your workspace # Set the VM size for AmlCompute from the list of supported_vmsizes try: compute_target = ComputeTarget(workspace=ws, name=cluster_name) print("Found existing compute target") except ComputeTargetException: print("Creating a new compute target...") compute_config = AmlCompute.provisioning_configuration( vm_size=args_dict["vm_size"], max_nodes=1) compute_target = ComputeTarget.create(ws, cluster_name, compute_config) compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=10) s = ws.get_default_datastore() # A reference to the root_path in azure after uplaoding _ = s.upload( src_dir=data_path, target_path=data_path, overwrite=False, show_progress=True, ) # All path except file_name # script_target_path = "/".join(args_dict['yaml_file'].split("/")[:-1]) script_target_path = "/".join( conf_file.split("/")[:-1]) # All path except file_name print(script_target_path) # script_fname = args.config_file.split("/")[-1] script_fname = conf_file.split("/")[-1] print(script_fname) print("---" * 100) azure_script_path = s.upload_files( files=[conf_file], target_path=script_target_path, overwrite=True, show_progress=True, ) print(azure_script_path) azure_script_abs_path = DataReference(datastore=s, data_reference_name="input_data", path_on_datastore=conf_file) azure_root_path = DataReference( datastore=s, data_reference_name="root_data", path_on_datastore=args_dict["root_path"], ) exp = Experiment(workspace=ws, name=experiment_name) # src = ScriptRunConfig(source_directory = script_folder, # script = 'run.py', arguments=['--config_file', 'local/pairs.json'], # run_config = run_temp_compute) # Using pytorch estimator - proper way to submit pytorch jobs script_params = { "--config_file": azure_script_abs_path, "--root_path": azure_root_path, "--experiment_name": experiment_name, } print("GPU Disabled: {}".format(disable_gpu)) estimator = PyTorch( source_directory=script_folder, script_params=script_params, compute_target=compute_target, entry_script="run.py", use_gpu=not disable_gpu, pip_packages=["pillow==5.4.1"], ) # you can name this as run _ = exp.submit(estimator)
script_params = { '--data-folder': ws.get_default_datastore().as_mount(), '--batch-size': 50, '--first-layer-neurons': 300, '--second-layer-neurons': 100, '--learning-rate': 0.01 } est = TensorFlow(source_directory=script_folder, script_params=script_params, compute_target=compute_target, entry_script='tf_mnist.py', use_gpu=True, framework_version='1.12') run = exp.submit(est) run.wait_for_completion(show_output=True, wait_post_processing=True) # Raise exception if run fails if run.get_status() == "Failed": raise Exception( "Training on local failed with following run status: {} and logs: \n {}".format( run.get_status(), run.get_details_with_logs() ) ) # Writing the run id to /aml_config/run_id.json run_id = {} run_id["run_id"] = run.id
entry_script='retrain.py', pip_packages=['tensorflow_hub'], node_count=1, use_gpu=True) # Overwrite data store reference dr = DataReferenceConfiguration( datastore_name=ds.name, path_on_datastore='flower_photos', mode='download', # download files from datastore to compute target overwrite=True) estimator.run_config.data_references[ds.name] = dr # Submit Experiment print("Training the model...") run = experiment.submit(estimator) run.wait_for_completion(show_output=True) print("Waiting for the run to complete...") status = run.get_status() while status != 'Completed' and status != 'Failed': print('current status: {} - waiting...'.format(run.get_status())) time.sleep(30) status = run.get_status() # Download results print("Downloading the results...") for filename in run.get_file_names(): if filename.startswith('outputs'): print("downloading", filename, '...') run.download_file(filename,
# create a new RunConfig object conda_run_config = RunConfiguration(framework="python") # Set compute target to the Linux DSVM conda_run_config.target = dsvm_compute.name # set the data reference of the run coonfiguration conda_run_config.data_references = {ds.name: dr} # specify conda packages to install on the VM conda_run_config.environment.python.conda_dependencies = CondaDependencies.create( conda_packages=ast.literal_eval(config['train']['conda_packages'])) from azureml.core import Run from azureml.core import ScriptRunConfig src = ScriptRunConfig( source_directory='./', script=config['train']['script'], run_config=conda_run_config, # pass the datastore reference as a parameter to the training script arguments=['--data-folder', str(ds.as_download())]) run = exp.submit(config=src) run.wait_for_completion(show_output=True) # Register the model print('Registering model...') model = run.register_model(model_name=config['train']['model_name'], model_path='./outputs/ridge_1.pkl') print('Done registering model.')
# Get the training dataset diabetes_ds = ws.datasets.get("diabetes dataset") # Create an estimator estimator = Estimator(source_directory=experiment_folder, inputs=[diabetes_ds.as_named_input('diabetes')], script_params=script_params, compute_target = 'local', environment_definition = diabetes_env, entry_script='diabetes_training.py') # Create an experiment experiment = Experiment(workspace = ws, name = 'diabetes-training') # Run the experiment run = experiment.submit(config=estimator) # Show the run details while running RunDetails(run).show() run.wait_for_completion() # Register the environment diabetes_env.register(workspace=ws) #run on remote compute #check for existing from azureml.core.compute import ComputeTarget, AmlCompute from azureml.core.compute_target import ComputeTargetException
def main(req: func.HttpRequest) -> (func.HttpResponse): logging.info('Python HTTP trigger function processed a request.') # For now this can be a POST where we have <base url>/api/HttpTrigger?start=<any string> image_url = req.params.get('start') logging.info(type(image_url)) # Use service principal secrets to create authentication vehicle and # define workspace object try: svc_pr = ServicePrincipalAuthentication( tenant_id=os.getenv('TENANT_ID', ''), service_principal_id=os.getenv('APP_ID', ''), service_principal_password=os.getenv('PRINCIPAL_PASSWORD', '')) ws = Workspace(subscription_id=os.getenv('AZURE_SUB', ''), resource_group=os.getenv('RESOURCE_GROUP', ''), workspace_name=os.getenv('WORKSPACE_NAME',''), auth=svc_pr) print("Found workspace {} at location {} using Azure CLI \ authentication".format(ws.name, ws.location)) # Usually because authentication didn't work except ProjectSystemException as err: print('Authentication did not work.') return json.dumps('ProjectSystemException') # Need to create the workspace except Exception as err: ws = Workspace.create(name=os.getenv('WORKSPACE_NAME', ''), subscription_id=os.getenv('AZURE_SUB', ''), resource_group=os.getenv('RESOURCE_GROUP', ''), create_resource_group=True, location='westus', # Or other supported Azure region auth=svc_pr) print("Created workspace {} at location {}".format(ws.name, ws.location)) # choose a name for your cluster - under 16 characters cluster_name = "gpuforpytorch" try: compute_target = ComputeTarget(workspace=ws, name=cluster_name) print('Found existing compute target.') except ComputeTargetException: print('Creating a new compute target...') # AML Compute config - if max_nodes are set, it becomes persistent storage that scales compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', min_nodes=0, max_nodes=2) # create the cluster compute_target = ComputeTarget.create(ws, cluster_name, compute_config) compute_target.wait_for_completion(show_output=True) # use get_status() to get a detailed status for the current cluster. # print(compute_target.get_status().serialize()) # # Create a project directory and copy training script to ii project_folder = os.path.join(os.getcwd(), 'HttpTrigger', 'project') # os.makedirs(project_folder, exist_ok=True) # shutil.copy(os.path.join(os.getcwd(), 'HttpTrigger', 'pytorch_train.py'), project_folder) # Create an experiment experiment_name = 'fish-no-fish' experiment = Experiment(ws, name=experiment_name) # Use an AML Data Store for training data ds = Datastore.register_azure_blob_container(workspace=ws, datastore_name='funcdefaultdatastore', container_name=os.getenv('STORAGE_CONTAINER_NAME_TRAINDATA', ''), account_name=os.getenv('STORAGE_ACCOUNT_NAME', ''), account_key=os.getenv('STORAGE_ACCOUNT_KEY', ''), create_if_not_exists=True) # Use an AML Data Store to save models back up to ds_models = Datastore.register_azure_blob_container(workspace=ws, datastore_name='modelsdatastorage', container_name=os.getenv('STORAGE_CONTAINER_NAME_MODELS', ''), account_name=os.getenv('STORAGE_ACCOUNT_NAME', ''), account_key=os.getenv('STORAGE_ACCOUNT_KEY', ''), create_if_not_exists=True) # Set up for training ("trans" flag means - use transfer learning and # this should download a model on compute) # Using /tmp to store model and info due to the fact that # creating new folders and files on the Azure Function host # will trigger the function to restart. 
    script_params = {
        '--data_dir': ds.as_mount(),
        '--num_epochs': 30,
        '--learning_rate': 0.01,
        '--output_dir': '/tmp/outputs',
        '--trans': 'True'
    }

    # Instantiate PyTorch estimator with upload of the final model to
    # a specified blob storage container (this can be anything)
    estimator = PyTorch(source_directory=project_folder,
                        script_params=script_params,
                        compute_target=compute_target,
                        entry_script='pytorch_train.py',
                        use_gpu=True,
                        inputs=[ds_models.as_upload(path_on_compute='./outputs/model_finetuned.pth')])

    run = experiment.submit(estimator)
    print(run.get_details())

    # # The following would certainly be blocking, but that's ok for debugging
    # while run.get_status() not in ['Completed', 'Failed']: # For example purposes only, not exhaustive
    #     print('Run {} not in terminal state'.format(run.id))
    #     time.sleep(10)

    # Wrap the payload so the return type matches the annotation above
    return func.HttpResponse(json.dumps(run.get_status()))
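# As the commented-out loop above notes, polling inside the trigger would
# block the function. A minimal sketch of the non-blocking alternative:
# persist run.id somewhere and look the run up again from a later
# invocation. The helper name below is hypothetical.
from azureml.core import Experiment, Run

def get_run_status(ws, experiment_name: str, run_id: str) -> str:
    """Fetch a previously submitted run by id and report its current status."""
    experiment = Experiment(ws, name=experiment_name)
    run = Run(experiment, run_id)
    return run.get_status()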
async def __create_cluster(self):
    self.__print_message("Setting up cluster")
    exp = Experiment(self.workspace, self.experiment_name)
    estimator = Estimator(
        os.path.join(self.abs_path, "setup"),
        compute_target=self.compute_target,
        entry_script="start_scheduler.py",
        environment_definition=self.environment_definition,
        script_params=self.scheduler_params,
        node_count=1,  ### start only the scheduler
        distributed_training=MpiConfiguration(),
        use_docker=True,
        inputs=self.datastores,
    )

    run = exp.submit(estimator, tags=self.tags)
    self.__print_message("Waiting for scheduler node's IP")
    status = run.get_status()
    while status not in ("Canceled", "Failed") and "scheduler" not in run.get_metrics():
        print(".", end="")
        logger.info("Scheduler not ready")
        time.sleep(5)
        status = run.get_status()

    if status in ("Canceled", "Failed"):
        run_error = run.get_details().get("error")
        error_message = "Failed to start the AzureML cluster."
        if run_error:
            error_message = "{} {}".format(error_message, run_error)
        logger.error(error_message)
        if not self.compute_target_set:
            self.__delete_compute_target()
        raise Exception(error_message)

    print("\n")

    ### SET FLAGS
    self.scheduler_ip_port = run.get_metrics()["scheduler"]
    self.worker_params["--scheduler_ip_port"] = self.scheduler_ip_port
    self.__print_message(f"Scheduler: {self.scheduler_ip_port}")
    self.run = run

    ### CHECK IF ON THE SAME VNET
    max_retry = 5
    while self.same_vnet is None and max_retry > 0:
        time.sleep(5)
        await self.sync(self.__check_if_scheduler_ip_reachable)
        max_retry -= 1

    if self.same_vnet is None:
        self.run.cancel()
        if not self.compute_target_set:
            self.__delete_compute_target()
        logger.error(
            "Connection error after retrying. Failed to start the AzureML cluster.")
        return

    ### REQUIRED BY dask.distributed.deploy.cluster.Cluster
    self.hostname = socket.gethostname()
    self.is_in_ci = (
        f"/mnt/batch/tasks/shared/LS_root/mounts/clusters/{self.hostname}"
        in os.getcwd())
    _scheduler = self.__prepare_rpc_connection_to_headnode()
    self.scheduler_comm = rpc(_scheduler)
    await self.sync(self.__setup_port_forwarding)

    try:
        await super()._start()
    except Exception as e:
        logger.exception(e)
        # CLEAN UP COMPUTE TARGET
        self.run.cancel()
        if not self.compute_target_set:
            self.__delete_compute_target()
        return

    await self.sync(self.__update_links)

    self.__print_message("Connections established")
    self.__print_message(f"Scaling to {self.initial_node_count} workers")
    if self.initial_node_count > 1:
        # `scale` keeps proper track of the workers
        self.scale(self.initial_node_count)
    self.__print_message("Scaling is done")
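# For context, a cluster started by the method above is consumed like any
# other Dask cluster. A short usage sketch, assuming the surrounding class
# has been instantiated as `cluster`:
from dask.distributed import Client

client = Client(cluster)                            # connect to the scheduler
print(client.submit(lambda x: x + 1, 41).result())  # trivial smoke test -> 42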
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import MpiConfiguration
from azureml.train.estimator import Estimator

use_estimator = True
if use_estimator:
    if cv:
        script_params = {'--cv': cv}
        # dask-mpi needs 2 extra nodes for its scheduler and client
        node_count = cv + 2
        distributed_training = MpiConfiguration()
    else:
        script_params = None
        node_count = None
        distributed_training = None
    to_run = Estimator(source_directory='.',
                       compute_target=compute_target,
                       entry_script='train.py',
                       script_params=script_params,
                       node_count=node_count,
                       use_gpu=False,
                       conda_dependencies_file='env.yml',
                       distributed_training=distributed_training)
else:
    if cv:
        arguments = ['--cv', str(cv)]
    else:
        arguments = []
    # run_conf is a RunConfiguration built elsewhere (a sketch follows below)
    to_run = ScriptRunConfig(source_directory='.',
                             script='train.py',
                             arguments=arguments,
                             run_config=run_conf)

run = exp.submit(to_run)
run.wait_for_completion(show_output=True)
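# The ScriptRunConfig branch above relies on a `run_conf` built elsewhere.
# A minimal sketch of what such a RunConfiguration might look like; the
# dependency list is an assumption, not taken from the original.
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration

run_conf = RunConfiguration()
run_conf.target = compute_target
run_conf.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['scikit-learn'])  # assumed dependency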
import logging

import azureml.core
from azureml.core import Experiment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration
from azureml.train.automl import AutoMLConfig

run_config = RunConfiguration(framework="python")
run_config.target = compute_target
run_config.environment.docker.enabled = True
run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE
run_config.environment.environment_variables = script_env

dependencies = CondaDependencies.create(
    pip_packages=["scikit-learn", "scipy", "numpy"])
run_config.environment.python.conda_dependencies = dependencies

# Configure and submit the AutoML forecasting training
automl_config_common = {
    'task': 'forecasting',
    'primary_metric': 'normalized_root_mean_squared_error',
    'verbosity': logging.INFO,
    'time_column_name': time_column_name,
    'max_horizon': horizon,
    'iterations': 10,
    'n_cross_validations': 5,
    'enable_ensembling': True
}

automl_config = AutoMLConfig(path=script_folder,
                             data_script='get_data.py',
                             compute_target=compute_target,
                             run_configuration=run_config,
                             **automl_config_common)

exp = Experiment(workspace=ws, name=experiment_name)
run = exp.submit(automl_config, show_output=True)
best_run, fitted_model = run.get_output()
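# Once get_output() has returned the fitted pipeline, it can produce
# forecasts. A hedged sketch: X_test is a hypothetical DataFrame holding
# future values of the time column (and any regressors) over the horizon.
y_pred, X_trans = fitted_model.forecast(X_test)  # AutoML forecasting entry point
print(y_pred[:5])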
from azureml.core import ScriptRunConfig
from azureml.train.hyperdrive import (HyperDriveConfig, PrimaryMetricGoal,
                                      RandomParameterSampling, choice)

# The head of this call was truncated in the source; source_directory,
# script, and compute_target are assumptions, only environment=env survives
# from the original
src = ScriptRunConfig(source_directory='.',
                      script='train.py',
                      compute_target=compute_target,
                      environment=env)

param_sampling = RandomParameterSampling({
    "--num-topics": choice(5, 10, 15, 20)
})

# Submit experiment
hd = HyperDriveConfig(run_config=src,
                      hyperparameter_sampling=param_sampling,
                      primary_metric_name="c_v",
                      primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                      max_total_runs=100,
                      max_concurrent_runs=4)
run = exp.submit(config=hd)
run.wait_for_completion(show_output=False)

print(run.get_metrics())
print(run.get_file_names())

# Register model
best_run = run.get_best_run_by_primary_metric()
model = best_run.register_model(model_name='gensim_lda', model_path='outputs')
print(model.name, model.id, model.version, sep='\t')
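# To see which sampled value of --num-topics won, the best run's metrics and
# arguments can be inspected. A small sketch:
best_run_metrics = best_run.get_metrics()
print("Best c_v:", best_run_metrics.get("c_v"))
print("Arguments:", best_run.get_details()["runDefinition"]["arguments"])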
%%writefile $script_folder/train.py

import argparse
import os
import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler

from sklearn.externals import joblib

from azureml.core import Run
from utils import load_data

import subprocess
import sys

def install(package):
    subprocess.call([sys.executable, "-m", "pip", "install", package])

install('pandas')
install('azure-storage')
install('tables')

import pandas as pd

# let the user feed in parameters, e.g. the location of the data files (from the datastore)
parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point')
args = parser.parse_args()

data_folder = args.data_folder
print('Data folder:', data_folder)

# load the training set and standardize it
Data_training = pd.read_csv(os.path.join(data_folder, 'data.csv'))
Data_training = StandardScaler().fit_transform(Data_training)

# get hold of the current run
run = Run.get_context()

db = DBSCAN(eps=2, min_samples=10).fit(Data_training)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(Data_training, labels))

os.makedirs('outputs', exist_ok=True)
pd.DataFrame(db.components_).to_csv("outputs/model.csv", header=None, index=None)

from azure.storage.blob import BlockBlobService
import tables

# Credentials must never be hard-coded in a training script (the original
# cell embedded a literal account name and key); read them from environment
# variables instead
STORAGEACCOUNTNAME = os.getenv("STORAGE_ACCOUNT_NAME", "")
STORAGEACCOUNTKEY = os.getenv("STORAGE_ACCOUNT_KEY", "")
LOCALFILENAME = "outputs/model.csv"
CONTAINERNAME = "testconta"
BLOBNAME = "model/model.csv"

output_blob_service = BlockBlobService(account_name=STORAGEACCOUNTNAME, account_key=STORAGEACCOUNTKEY)
localfileprocessed = os.path.join(os.getcwd(), LOCALFILENAME)  # assuming the file is in the current working directory
try:
    output_blob_service.create_blob_from_path(CONTAINERNAME, BLOBNAME, localfileprocessed)
except Exception:
    print("Something went wrong with uploading to the blob: " + BLOBNAME)

# note: files saved in the outputs folder are automatically uploaded into the experiment record
# joblib.dump(value=clf, filename='outputs/sklearn_mnist_model.pkl')

import shutil
shutil.copy('utils.py', script_folder)

from azureml.train.sklearn import SKLearn

script_params = {
    '--data-folder': ds.path('dbscndata').as_mount()
}

# set up the estimator for training
est = SKLearn(source_directory=script_folder,
              script_params=script_params,
              compute_target=compute_target,
              entry_script='train.py')

print(ds.path('dbscndata').as_mount())

# submit the estimator to start the training run
run = exp.submit(config=est)
run

# show run details while training
from azureml.widgets import RunDetails
RunDetails(run).show()

# register the model
model = run.register_model(model_name='dbscan', model_path='outputs/model.csv')
print(model.name, model.id, model.version, sep='\t')
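# After registration the model CSV can be fetched from anywhere the
# workspace is reachable. A minimal sketch:
from azureml.core.model import Model

dbscan_model = Model(ws, name='dbscan')
local_path = dbscan_model.download(target_dir='.', exist_ok=True)
print("Model downloaded to", local_path)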
output['Subscription ID'] = ws.subscription_id
output['Workspace'] = ws.name
output['Resource Group'] = ws.resource_group
output['Location'] = ws.location
output['Project Directory'] = project_folder
pd.set_option('display.max_colwidth', -1)
pd.DataFrame(data=output, index=['']).T

# the get_data script reads this file on the remote compute now
csv_file = "../data/" + experiment_name + ".csv"

automl_settings = {
    "iteration_timeout_minutes": 10,
    "iterations": 30,
    "primary_metric": 'spearman_correlation',
    "preprocess": True,
    "verbosity": logging.DEBUG,
    "n_cross_validations": 5
}

# read_csv must be called with the file path (the original line assigned the
# bare function); the resulting dataflow is kept for local inspection
dflow = dprep.read_csv(csv_file)

automl_config = AutoMLConfig(task='regression',
                             debug_log='automl_errors.log',
                             path=project_folder,
                             compute_target=compute_target,
                             data_script="get_data.py",
                             **automl_settings)

experiment = Experiment(ws, 'automl_remote')
remote_run = experiment.submit(automl_config, show_output=True)
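# When the remote AutoML run finishes, the best iteration is retrieved the
# same way as in the local case. A short sketch:
remote_run.wait_for_completion(show_output=False)
best_run, fitted_model = remote_run.get_output()
print(best_run.id, best_run.get_metrics().get('spearman_correlation'))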
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE, RunConfiguration

run_config = RunConfiguration()

# Signal that you want to use AmlCompute to execute the script;
# AmlCompute will be created in the same region as the workspace
run_config.target = "amlcompute"

# Set the VM size for AmlCompute
run_config.amlcompute.vm_size = 'STANDARD_D2_V2'

# Enable Docker and use the default CPU-based base image
run_config.environment.docker.enabled = True
run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE

# Use a conda environment in the Docker image for execution
# rather than user-managed dependencies
run_config.environment.python.user_managed_dependencies = False

# Auto-prepare the Docker image when used for execution (if it is not already prepared)
run_config.auto_prepare_environment = True

# Specify the CondaDependencies object
run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['scikit-learn'])

# Now submit a run on AmlCompute
from azureml.core.script_run_config import ScriptRunConfig

script_run_config = ScriptRunConfig(source_directory=project_folder,
                                    script='train.py',
                                    run_config=run_config)

run = experiment.submit(script_run_config)
run.wait_for_completion()
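# Once the run completes, its logged metrics and output files are available.
# A minimal sketch; the metric names and model path are assumptions, since
# train.py is not shown here.
print(run.get_metrics())
model = run.register_model(model_name='sklearn-model',      # assumed name
                           model_path='outputs/model.pkl')  # assumed output path
print(model.name, model.version)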