def get_parameter_search_hyperdrive_config(
        self, estimator: Estimator) -> HyperDriveConfig:
    """
    Build the Azure HyperDrive configuration used for the parameter search.

    Background is given in the tutorial
    https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters
    and the API reference is at
    https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive?view=azure-ml-py

    :param estimator: The estimator (configured PyTorch environment) of the experiment.
    :return: An Azure HyperDrive run configuration (configured PyTorch environment).
    """
    # Only the learning rate is searched; it is drawn at random from
    # uniform(0.0005, 0.01).
    sampling = RandomParameterSampling({'l_rate': uniform(0.0005, 0.01)})

    # Bandit policy used to terminate poorly performing runs early.
    bandit = BanditPolicy(
        slack_factor=0.15,
        evaluation_interval=1,
        delay_evaluation=10,
    )

    # The sweep minimises the tracked validation loss over at most 10 runs,
    # two of them running concurrently.
    hyperdrive_settings = {
        'estimator': estimator,
        'hyperparameter_sampling': sampling,
        'policy': bandit,
        'primary_metric_name': TrackedMetrics.Val_Loss.value,
        'primary_metric_goal': PrimaryMetricGoal.MINIMIZE,
        'max_total_runs': 10,
        'max_concurrent_runs': 2,
    }
    return HyperDriveConfig(**hyperdrive_settings)
def hyperparameter_tuning(ws, experiment):
    """Create and submit a HyperDrive job, display its run widget and return the run.

    :param ws: Workspace object; its compute target named ``AML.compute_name``
        and its default datastore are used.
    :param experiment: Experiment the HyperDrive configuration is submitted to.
    :return: The run object returned by ``experiment.submit``.
    """
    compute = ws.compute_targets[AML.compute_name]
    datastore_mount = ws.get_default_datastore().as_mount()

    # Training estimator: runs scripts/train.py with the mounted datastore.
    estimator = TensorFlow(
        source_directory='scripts',
        compute_target=compute,
        entry_script='train.py',
        script_params={'--datastore-dir': datastore_mount},
        use_gpu=True,
    )

    # Only the learning rate is swept.
    sampling = RandomParameterSampling(
        {'--learning-rate': loguniform(-15, -3)}
    )
    bandit = BanditPolicy(slack_factor=0.15, evaluation_interval=2)

    sweep_config = HyperDriveRunConfig(
        estimator=estimator,
        hyperparameter_sampling=sampling,
        policy=bandit,
        primary_metric_name="validation_accuracy",
        primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
        max_total_runs=20,
        max_concurrent_runs=4,
    )

    hd_run = experiment.submit(sweep_config)
    # Show the run-details widget for the submitted run.
    RunDetails(Run(experiment, hd_run.id)).show()
    return hd_run
def make_early_term_policy(self, policy_type, eval_interval=1, delay_eval=0,
                           truncation_percentage=.1, slack_factor=None,
                           slack_amount=None):
    """Create a HyperDrive early-termination policy from a policy type name.

    :param policy_type: One of "bandit", "median", "truncation" or "none".
    :param eval_interval: Passed to the policy as ``evaluation_interval``.
    :param delay_eval: Passed to the policy as ``delay_evaluation``.
    :param truncation_percentage: Used by the "truncation" policy only.
    :param slack_factor: Used by the "bandit" policy only.
    :param slack_amount: Used by the "bandit" policy only.
    :return: The constructed policy object.

    Any other ``policy_type`` is reported through ``errors.config_error``.
    """
    from azureml.train.hyperdrive import (
        BanditPolicy,
        MedianStoppingPolicy,
        NoTerminationPolicy,
        TruncationSelectionPolicy,
    )

    if policy_type == "bandit":
        # The keyword is `delay_evaluation`, as for the other policies below;
        # the previous `delay_eval=` keyword made this branch raise TypeError.
        policy = BanditPolicy(
            evaluation_interval=eval_interval,
            slack_factor=slack_factor,
            slack_amount=slack_amount,
            delay_evaluation=delay_eval)
    elif policy_type == "median":
        policy = MedianStoppingPolicy(
            evaluation_interval=eval_interval,
            delay_evaluation=delay_eval)
    elif policy_type == "truncation":
        policy = TruncationSelectionPolicy(
            truncation_percentage=truncation_percentage,
            evaluation_interval=eval_interval,
            delay_evaluation=delay_eval)
    elif policy_type == "none":
        policy = NoTerminationPolicy()
    else:
        # str() keeps the intended error reachable when policy_type is not a
        # string (e.g. None) instead of failing on the concatenation itself.
        # NOTE(review): errors.config_error presumably raises; if it returns,
        # the `return policy` below fails because `policy` was never bound.
        errors.config_error("Unrecognized policy type=" + str(policy_type))

    return policy
def main(epochs, iterations, compute_target, concurrent_runs):
    """Run a HyperDrive sweep for the training script and publish the best run id.

    :param epochs: Value passed to the training script as ``--epochs``.
    :param iterations: Maximum total number of HyperDrive runs.
    :param compute_target: Name of the compute target in the workspace.
    :param concurrent_runs: Maximum number of runs executed concurrently.
    """
    auth = AzureCliAuthentication()
    experiment = Experiment.from_directory(".", auth=auth)
    workspace = experiment.workspace
    target = workspace.compute_targets[compute_target]
    image_store = workspace.datastores['food_images']

    estimator = TensorFlow(
        source_directory=".",
        entry_script='code/train/train.py',
        script_params={
            "--data-dir": image_store.as_mount(),
            "--epochs": epochs,
        },
        compute_target=target,
        conda_packages=['pillow', 'pandas'],
        pip_packages=['click', 'seaborn'],
        use_docker=True,
        use_gpu=True,
        framework_version='1.13',
    )

    # Restrict training to a subset of the food categories.
    food_subset = ['apple_pie', 'baby_back_ribs', 'baklava', 'beef_carpaccio']
    estimator.run_config.arguments.extend(food_subset)

    # Hyperparameter search space.
    search_space = {
        '--minibatch-size': choice(16, 32, 64),
        '--learning-rate': loguniform(-9, -6),
        '--optimizer': choice('rmsprop', 'adagrad', 'adam'),
    }
    sampler = RandomParameterSampling(search_space)

    # Early termination policy.
    bandit = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

    # HyperDrive run configuration.
    sweep_config = HyperDriveConfig(
        estimator=estimator,
        hyperparameter_sampling=sampler,
        policy=bandit,
        primary_metric_name='acc',
        primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
        max_total_runs=iterations,
        max_concurrent_runs=concurrent_runs,
    )

    # Submit the sweep and block until it has finished.
    print("Submitting Hyperdrive Run")
    sweep_run = experiment.submit(sweep_config)
    sweep_run.wait_for_completion(raise_on_error=True, show_output=True)
    print("Finishing Run")

    # Emit the best run's id as a "##vso[task.setvariable ...]" line.
    winner = sweep_run.get_best_run_by_primary_metric()
    print(f'##vso[task.setvariable variable=run_id]{winner.id}')
def get_policy(policy_settings):
    """Build an early-termination policy from a settings mapping.

    ``policy_settings["name"]`` selects the policy by substring match, checked
    in this order: "bandit", "medianstopping", "noterminal",
    "truncationselection". The bandit, median-stopping and truncation policies
    read "evaluation_interval" and "delay_evaluation"; bandit additionally
    reads the nested "bandit" settings and truncation the nested
    "truncationselection" settings.

    :param policy_settings: Mapping holding the policy name and its options.
    :return: The policy object, or None when no known name matches.
    """
    name = policy_settings["name"]

    if "bandit" in name:
        interval = policy_settings["evaluation_interval"]
        delay = policy_settings["delay_evaluation"]
        bandit_cfg = policy_settings["bandit"]
        return BanditPolicy(
            evaluation_interval=interval,
            delay_evaluation=delay,
            slack_factor=bandit_cfg["slack_factor"],
            slack_amount=bandit_cfg["slack_amount"],
        )

    if "medianstopping" in name:
        interval = policy_settings["evaluation_interval"]
        delay = policy_settings["delay_evaluation"]
        return MedianStoppingPolicy(
            evaluation_interval=interval,
            delay_evaluation=delay,
        )

    if "noterminal" in name:
        return NoTerminationPolicy()

    if "truncationselection" in name:
        interval = policy_settings["evaluation_interval"]
        delay = policy_settings["delay_evaluation"]
        truncation_cfg = policy_settings["truncationselection"]
        return TruncationSelectionPolicy(
            evaluation_interval=interval,
            delay_evaluation=delay,
            truncation_percentage=truncation_cfg["truncation_percentage"],
        )

    # Unknown policy name.
    return None
def get_policy(policy_method, evaluation_interval, delay_evaluation, **kwargs):
    """Build an early-termination policy selected by (case-insensitive) name.

    The lower-cased ``policy_method`` is matched by substring in this order:
    "bandit", "medianstopping", "notermination", "truncationselection".

    :param policy_method: Name of the policy to create.
    :param evaluation_interval: Forwarded to the created policy.
    :param delay_evaluation: Forwarded to the created policy.
    :param kwargs: Optional "slack_factor" and "slack_amount" (bandit) or
        "truncation_percentage" (truncation selection); missing entries are
        passed on as None.
    :return: The matching policy; a NoTerminationPolicy when nothing matches.
    """
    method = policy_method.lower()

    if "bandit" in method:
        return BanditPolicy(
            evaluation_interval=evaluation_interval,
            delay_evaluation=delay_evaluation,
            slack_factor=kwargs.get("slack_factor"),
            slack_amount=kwargs.get("slack_amount"),
        )

    if "medianstopping" in method:
        return MedianStoppingPolicy(
            evaluation_interval=evaluation_interval,
            delay_evaluation=delay_evaluation,
        )

    if "notermination" in method:
        return NoTerminationPolicy()

    if "truncationselection" in method:
        return TruncationSelectionPolicy(
            evaluation_interval=evaluation_interval,
            delay_evaluation=delay_evaluation,
            truncation_percentage=kwargs.get("truncation_percentage"),
        )

    # Unrecognised names fall back to no early termination.
    return NoTerminationPolicy()
'model_name': choice('maskrcnn_resnet50_fpn'), 'learning_rate': uniform(0.0001, 0.001), #'warmup_cosine_lr_warmup_epochs': choice(0, 3), 'optimizer': choice('sgd', 'adam', 'adamw'), 'min_size': choice(600, 800) } tuning_settings = { 'iterations': 20, 'max_concurrent_iterations': 4, 'hyperparameter_sampling': RandomParameterSampling(parameter_space), 'policy': BanditPolicy(evaluation_interval=2, slack_factor=0.2, delay_evaluation=6) } automl_image_config = AutoMLImageConfig( task='image-instance-segmentation', compute_target=compute_target, training_data=training_dataset, validation_data=validation_dataset, primary_metric='mean_average_precision', **tuning_settings) automl_image_run = experiment.submit(automl_image_config) automl_image_run.wait_for_completion(wait_post_processing=True) # Register the model from the best run
# Now that we've seen how to do a simple PyTorch training run using the SDK, let's see if we can further improve the accuracy of our model. We can optimize our model's hyperparameters using Azure Machine Learning's hyperparameter tuning capabilities. # Start a hyperparameter sweep # First, we will define the hyperparameter space to sweep over. Since our training script uses a learning rate schedule to decay the learning rate every several epochs, let's tune the initial learning rate and the momentum parameters. In this example we will use random sampling to try different configuration sets of hyperparameters to maximize our primary metric, the best validation accuracy (best_val_acc). # Then, we specify the early termination policy to use to early terminate poorly performing runs. Here we use the BanditPolicy, which will terminate any run that doesn't fall within the slack factor of our primary evaluation metric. In this tutorial, we will apply this policy every epoch (since we report our best_val_acc metric every epoch and evaluation_interval=1). Notice we will delay the first policy evaluation until after the first 10 epochs (delay_evaluation=10). Refer here for more information on the BanditPolicy and other policies available. from azureml.train.hyperdrive import RandomParameterSampling, HyperDriveRunConfig, BanditPolicy, PrimaryMetricGoal, uniform param_sampling = RandomParameterSampling( { 'learning_rate': uniform(0.0005, 0.005), 'momentum': uniform(0.9, 0.99) } ) early_termination_policy = BanditPolicy(slack_factor=0.15, evaluation_interval=1, delay_evaluation=10) hyperdrive_run_config = HyperDriveRunConfig(estimator=estimator, hyperparameter_sampling=param_sampling, policy=early_termination_policy, primary_metric_name='best_val_acc', primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, max_total_runs=8, max_concurrent_runs=4) # Finally, lauch the hyperparameter tuning job. 
# start the HyperDrive run hyperdrive_run = experiment.submit(hyperdrive_run_config) # Monitor HyperDrive runs
def build_pipeline(dataset, ws, config):
    """Build, validate, publish and schedule the prednet pipeline for a dataset.

    The pipeline consists of four steps run in sequence: video decoding, data
    preparation, a HyperDrive training sweep and model registration.

    :param dataset: Dataset name; used in datastore paths, passed to the
        training script as ``--dataset`` and appended to the pipeline name.
    :param ws: Workspace in which compute targets, pipeline and schedule are
        created.
    :param config: Mapping with the keys 'cpu_compute', 'gpu_compute',
        'acr_address', 'acr_username' and 'acr_password'.
    :return: The name of the published pipeline ('prednet_' + dataset).
    """
    print("building pipeline for dataset %s in workspace %s" % (dataset, ws.name))

    base_dir = '.'

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)

    script_files = (
        'video_decoding.py',
        'pipelines_submit.py',
        'pipelines_create.py',
        'train.py',
        'data_utils.py',
        'prednet.py',
        'keras_utils.py',
        'data_preparation.py',
        'model_registration.py',
        'config.json',
    )
    for file_name in script_files:
        shutil.copy(os.path.join(base_dir, file_name), script_folder)

    # CPU cluster: reuse it when it exists, otherwise provision it.
    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except Exception:
        # Any lookup failure is treated as "cluster does not exist". This was
        # a bare `except:`, which also trapped KeyboardInterrupt/SystemExit.
        print("creating new compute target")

        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_D2_V2',
            max_nodes=4,
            idle_seconds_before_scaledown=1800)
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name, provisioning_config)
        cpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # GPU cluster: same reuse-or-create logic.
    gpu_compute_name = config['gpu_compute']
    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except Exception:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6',
            max_nodes=10,
            idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name, provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # Reporting the status is best-effort: a failure is printed, not raised.
    # Exception (rather than BaseException) lets Ctrl-C still interrupt.
    try:
        print(gpu_compute_target.get_status().serialize())
    except Exception as e:
        print("Could not get status of compute target.")
        print(e)

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(
        conda_packages=["py-opencv=3.4.2"],
        pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D',
        pip_packages=[
            "azure-storage-blob==1.5.0",
            "hickle==3.4.3",
            "requests==2.21.0",
            "sklearn",
            "pandas==0.24.2",
            "azureml-sdk",
            "numpy==1.16.2",
            "pillow==6.0.0",
        ])

    # Run configuration for the CPU steps: Docker, no GPU support.
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    # DataReference to where video data is stored.
    video_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="video_data",
        path_on_datastore=os.path.join("prednet", "data", "video", dataset))
    print("DataReference object created")

    # Intermediate data passed between the pipeline steps.
    raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames", datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    # Step 1: decode the videos into raw frames.
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py",
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    print("video_decode step created")

    # Step 2: prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name='prepare_data',
        script_name="data_preparation.py",
        arguments=["--input_data", raw_data, "--output_data", preprocessed_data],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    data_prep.run_after(video_decoding)

    print("data_prep step created")

    # configure access to ACR for pulling our custom docker image
    acr = ContainerRegistry()
    acr.address = config['acr_address']
    acr.username = config['acr_username']
    acr.password = config['acr_password']

    est = Estimator(
        source_directory=script_folder,
        compute_target=gpu_compute_target,
        entry_script='train.py',
        use_gpu=True,
        node_count=1,
        custom_docker_image="wopauli_1.8-gpu:1",
        image_registry_details=acr,
        user_managed=True
    )

    # Step 3: HyperDrive sweep over the training hyperparameters.
    ps = RandomParameterSampling(
        {
            '--batch_size': choice(1, 2, 4, 8),
            '--filter_sizes': choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
            '--stack_sizes': choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),
            '--learning_rate': loguniform(-6, -1),
            '--lr_decay': loguniform(-9, -1),
            '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
            '--transfer_learning': choice("True", "False")
        }
    )

    policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1, delay_evaluation=10)

    hdc = HyperDriveConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        policy=policy,
        primary_metric_name='val_loss',
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=10,
        max_concurrent_runs=5,
        max_duration_minutes=60*6
    )

    hd_step = HyperDriveStep(
        name="train_w_hyperdrive",
        hyperdrive_run_config=hdc,
        estimator_entry_script_arguments=[
            '--data-folder', preprocessed_data,
            '--remote_execution',
            '--dataset', dataset
        ],
        inputs=[preprocessed_data],
        metrics_output=data_metrics,
        allow_reuse=True
    )
    hd_step.run_after(data_prep)

    # Step 4: register the model using the metrics written by the sweep.
    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=cpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.']
    )
    registration_step.run_after(hd_step)

    pipeline = Pipeline(workspace=ws, steps=[video_decoding, data_prep, hd_step, registration_step])
    print("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete")

    pipeline_name = 'prednet_' + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    # Create a schedule for the published pipeline that polls the dataset's
    # 'Train' folder on the datastore. Only the side effect is needed, so the
    # returned schedule object is not kept.
    Schedule.create(
        workspace=ws,
        name=pipeline_name + "_sch",
        pipeline_id=published_pipeline.id,
        experiment_name=pipeline_name,
        datastore=def_blob_store,
        wait_for_provisioning=True,
        description="Datastore scheduler for Pipeline" + pipeline_name,
        path_on_datastore=os.path.join('prednet/data/video', dataset, 'Train'),
        polling_interval=1
    )

    return pipeline_name
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal from azureml.pipeline.steps import HyperDriveStep from azureml.train.hyperdrive import choice, loguniform, uniform ps = RandomParameterSampling({ '--learning_rate': uniform(1e-3, 2e-2), '--momentum': uniform(.1, .95), '--weight_decay': loguniform(-5, -3), '--temperature': uniform(1, 9), # '--lambda_const': uniform(.1, .3), '--transfer_learning': choice("True", "False") }) policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1, delay_evaluation=10) hdc = HyperDriveConfig( estimator=est, hyperparameter_sampling=ps, policy=policy, primary_metric_name='val_loss', primary_metric_goal=PrimaryMetricGoal.MINIMIZE, max_total_runs=5, #100, max_concurrent_runs=5) hd_step = HyperDriveStep( name="train_w_hyperdrive", hyperdrive_config=hdc, estimator_entry_script_arguments=[
print() # start the job run = sk_est.fit() print(helpers.get_run_history_url(run)) run.wait_for_completion(show_output=True) print('configure hyperdrive.') # parameter space to sweep over ps = RandomParameterSampling({"alpha": uniform(0.0, 1.0)}) # early termniation policy # check every 2 iterations and if the primary metric (epoch_val_acc) falls # outside of the range of 10% of the best recorded run so far, terminate it. etp = BanditPolicy(slack_factor=0.1, evaluation_interval=2) # Hyperdrive run configuration hrc = HyperDriveRunConfig( ".", estimator=sk_est, hyperparameter_sampling=ps, policy=etp, # metric to watch (for early termination) primary_metric_name='mse', # terminate if metric falls below threshold primary_metric_goal=PrimaryMetricGoal.MINIMIZE, max_total_runs=20, max_concurrent_runs=4, compute_target=compute_target)
def build_pipeline(dataset, ws, config):
    """Build, validate and publish the prednet pipeline for a dataset.

    The pipeline consists of four steps run in sequence: video decoding, data
    preparation, a HyperDrive training sweep and model registration.

    :param dataset: Dataset name; used in the datastore path of the video data
        and appended to the pipeline name.
    :param ws: Workspace in which compute targets and pipeline are created.
    :param config: Mapping with the keys 'cpu_compute' and 'gpu_compute'.
    :return: The name of the published pipeline ('prednet_' + dataset).
    """
    print("building pipeline for dataset %s in workspace %s" %
          (dataset, ws.name))

    # The previous hostname check assigned '.' in both branches, so the
    # lookup and the branch were dead code.
    base_dir = '.'

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)

    script_files = (
        'video_decoding.py',
        'pipelines_submit.py',
        'pipelines_build.py',
        'train.py',
        'data_utils.py',
        'prednet.py',
        'keras_utils.py',
        'data_preparation.py',
        'model_registration.py',
        'config.json',
    )
    for file_name in script_files:
        shutil.copy(os.path.join(base_dir, file_name), script_folder)

    # CPU cluster: reuse it when it exists, otherwise provision it.
    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except ComputeTargetException:
        print("creating new compute target")

        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_D2_V2',
            max_nodes=4,
            idle_seconds_before_scaledown=1800)
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name,
                                                  provisioning_config)
        cpu_compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # GPU cluster: same reuse-or-create logic.
    gpu_compute_name = config['gpu_compute']
    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except ComputeTargetException:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6',
            max_nodes=5,
            idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name,
                                                  provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(gpu_compute_target.get_status().serialize())

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(
        conda_packages=["py-opencv=3.4.2"],
        pip_packages=[
            "azure-storage-blob==1.5.0",
            "hickle==3.4.3",
            "requests==2.21.0",
            "sklearn",
            "pandas==0.24.2",
            "azureml-sdk==1.0.21",
            "numpy==1.16.2",
            "pillow==6.0.0",
        ])

    gpu_cd = CondaDependencies.create(pip_packages=[
        "keras==2.0.8",
        "theano==1.0.4",
        "tensorflow==1.8.0",
        "tensorflow-gpu==1.8.0",
        "hickle==3.4.3",
        "matplotlib==3.0.3",
        "seaborn==0.9.0",
        "requests==2.21.0",
        "bs4==0.0.1",
        "imageio==2.5.0",
        "sklearn",
        "pandas==0.24.2",
        "azureml-sdk==1.0.21",
        "numpy==1.16.2",
    ])

    # Run configuration for the CPU steps: Docker, no GPU support.
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    # NOTE(review): gpu_compute_run_config is built here but not passed to any
    # step in this function; kept unchanged — confirm whether it can go.
    gpu_compute_run_config = RunConfiguration(conda_dependencies=gpu_cd)
    gpu_compute_run_config.environment.docker.enabled = True
    gpu_compute_run_config.environment.docker.gpu_support = True
    gpu_compute_run_config.environment.docker.base_image = DEFAULT_GPU_IMAGE
    gpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    # Location of the input videos on the default datastore.
    video_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="video_data",
        path_on_datastore=os.path.join("prednet", "data", "video", dataset))

    # Intermediate data passed between the pipeline steps.
    raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames",
                                     datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    print("DataReference object created")

    # Step 1: decode the videos into raw frames.
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py",
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.'])
    print("video_decode created")

    # Step 2: prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name='prepare_data',
        script_name="data_preparation.py",
        arguments=[
            "--input_data", raw_data, "--output_data", preprocessed_data
        ],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.'])
    data_prep.run_after(video_decoding)

    print("data_prep created")

    est = TensorFlow(
        source_directory=script_folder,
        compute_target=gpu_compute_target,
        pip_packages=[
            'keras==2.0.8', 'theano', 'tensorflow==1.8.0',
            'tensorflow-gpu==1.8.0', 'matplotlib', 'horovod', 'hickle'
        ],
        entry_script='train.py',
        use_gpu=True,
        node_count=1)

    # Step 3: HyperDrive sweep over the training hyperparameters.
    ps = RandomParameterSampling({
        '--batch_size': choice(2, 4, 8, 16),
        '--filter_sizes': choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        '--stack_sizes': choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),
        '--learning_rate': loguniform(-6, -1),
        '--lr_decay': loguniform(-9, -1),
        '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2",
                                  "2, 3", "0", "1", "2", "3"),
        '--transfer_learning': choice("True", "False")
    })

    policy = BanditPolicy(evaluation_interval=2,
                          slack_factor=0.1,
                          delay_evaluation=20)

    hdc = HyperDriveRunConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        policy=policy,
        primary_metric_name='val_loss',
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=5,
        max_concurrent_runs=5,
        max_duration_minutes=60 * 6)

    hd_step = HyperDriveStep(
        name="train_w_hyperdrive",
        hyperdrive_run_config=hdc,
        estimator_entry_script_arguments=[
            '--data-folder', preprocessed_data, '--remote_execution'
        ],
        inputs=[preprocessed_data],
        metrics_output=data_metrics,
        allow_reuse=True)
    hd_step.run_after(data_prep)

    # Step 4: register the model using the metrics written by the sweep.
    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=gpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.'])
    registration_step.run_after(hd_step)

    pipeline = Pipeline(
        workspace=ws,
        steps=[video_decoding, data_prep, hd_step, registration_step])
    print("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete")

    pipeline_name = 'prednet_' + dataset
    pipeline.publish(name=pipeline_name)

    return pipeline_name
# Make sure the experiment folder exists before it is used as source directory
os.makedirs(experiment_folder, exist_ok=True)
print("Experiment:", experiment.name)

# Fetch GPU cluster for computations
gpu_cluster = ComputeTarget(workspace=ws, name='demo-GPU-cluster')

# Sample a range of parameter values
params = GridParameterSampling({
    # There's only one parameter, so grid sampling will try each value -
    # with multiple parameters it would try every combination
    '--regularization': choice(0.001, 0.005, 0.01, 0.05, 0.1, 1.0)
})

# Set evaluation policy to stop poorly performing training runs early
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

# Get the training dataset
diabetes_ds = ws.datasets.get("diabetes_dataset")

# Create an estimator that uses the remote compute
hyper_estimator = SKLearn(
    source_directory=experiment_folder,
    inputs=[diabetes_ds.as_named_input('diabetes')],  # Pass the dataset as an input
    compute_target=gpu_cluster,
    conda_packages=['pandas', 'ipykernel', 'matplotlib'],
    pip_packages=['azureml-sdk', 'argparse', 'pyarrow'],
    entry_script='diabetes_training.py')

# Configure hyperdrive settings
def get_bandit_policy(self):
    """Return a BanditPolicy built from EVALUATION_INTERVAL and SLACK_FACTOR.

    Both constants are defined outside this method.
    """
    policy_kwargs = {
        "evaluation_interval": EVALUATION_INTERVAL,
        "slack_factor": SLACK_FACTOR,
    }
    return BanditPolicy(**policy_kwargs)
entry_script=PathsConfig.entry_script, use_gpu=True, custom_docker_image=settings["IMAGE_NAME"], ) if GeneralConfig.hyperdrive: if GeneralConfig.architecture_type == "PretrainedResNet50": hyperparams_space = HyperdriveConfig.pretrained_resnet50_hyperparams_space else: raise NotImplementedError hyperparams_space_format = { parameter: choice(parameter_range) for parameter, parameter_range in hyperparams_space.items() } parameters_sampling = RandomParameterSampling(hyperparams_space_format) policy = BanditPolicy( evaluation_interval=HyperdriveConfig.evaluation_interval, slack_factor=HyperdriveConfig.slack_factor, ) hdc = HyperDriveConfig( estimator=est, hyperparameter_sampling=parameters_sampling, policy=policy, primary_metric_name="Accuracy", primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, max_total_runs=HyperdriveConfig.max_total_runs, max_concurrent_runs=HyperdriveConfig.max_concurrent_runs, ) run = exp.submit(hdc) else: run = exp.submit(est)
#normal, uniform, lognormal, loguniform #for different type of sampling we need parameter as per there nature # 1. GridSearch - parametres range should be discrete values # 2. RandomSearch - params can be a mix of discrete and continuous values # 3. Baysian Search - parms can be choice, uniform, quniform and can be combined with early stopping policy # Early Stopping Policies # 1. Bandit policy: stop the run if the target metric performance underperforms the best run so far by a specified margin # for example - if the metric is 0.2 or more worse than the best then from azureml.train.hyperdrive import BanditPolicy early_termination_policy = BanditPolicy(slack_amount=0.2, evaluation_interval=1, delay_evaluation=5) # 2. Median Stopping policy: abandons runs where the target performance metric is worse than the median of th erunning averages for all the runs from azureml.train.hyperdrive import MedianStoppingPolicy early_termination_policy = MedianStoppingPolicy(evaluation_interval=1, delay_evaluation=5) # 3. truncation selection policy: it cancels the lower performing X% of the runs at each evaluatin interval based on the trunc_perc value you specify for X.