Example no. 1
            "--disable_tqdm",
            True,
        ],
        compute_target=target,
        environment=env,
    )

    # set up hyperdrive search space
    convert_base = lambda x: float(np.log(x))
    search_space = {
        # NB. loguniform samples on [exp(min), exp(max)]
        "--learning_rate": hyperdrive.loguniform(convert_base(1e-6),
                                                 convert_base(5e-2)),
        "--weight_decay": hyperdrive.uniform(5e-3, 15e-2),
        "--per_device_train_batch_size": hyperdrive.choice([16, 32]),
    }
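
    # Not part of the original snippet: a quick sanity check of the convert_base
    # convention above, assuming numpy (np) is already imported. HyperDrive's
    # loguniform(a, b) samples values whose natural log is uniform on [a, b], so
    # passing np.log(1e-6) and np.log(5e-2) keeps the learning rate in [1e-6, 5e-2].
    low, high = convert_base(1e-6), convert_base(5e-2)
    example_lr = float(np.exp(np.random.uniform(low, high)))
    assert 1e-6 <= example_lr <= 5e-2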

    hyperparameter_sampling = RandomParameterSampling(search_space)

    policy = TruncationSelectionPolicy(truncation_percentage=50,
                                       evaluation_interval=2,
                                       delay_evaluation=0)

    hyperdrive_config = HyperDriveConfig(
        run_config=config,
        hyperparameter_sampling=hyperparameter_sampling,
        policy=policy,
        primary_metric_name="eval_matthews_correlation",
Example no. 2
    validation_data=validation_dataset,
    hyperparameter_sampling=GridParameterSampling(
        {'model_name': choice('maskrcnn_resnet50_fpn')}))

automl_image_run = experiment.submit(image_config_maskrcnn)

automl_image_run.wait_for_completion(wait_post_processing=True)

from azureml.train.automl import AutoMLImageConfig
from azureml.train.hyperdrive import GridParameterSampling, RandomParameterSampling, BayesianParameterSampling
from azureml.train.hyperdrive import BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.train.hyperdrive import choice, uniform

parameter_space = {
    'model_name': choice('maskrcnn_resnet50_fpn'),
    'learning_rate': uniform(0.0001, 0.001),
    #'warmup_cosine_lr_warmup_epochs': choice(0, 3),
    'optimizer': choice('sgd', 'adam', 'adamw'),
    'min_size': choice(600, 800)
}

tuning_settings = {
    'iterations': 20,
    'max_concurrent_iterations': 4,
    'hyperparameter_sampling': RandomParameterSampling(parameter_space),
    'policy': BanditPolicy(evaluation_interval=2,
                           slack_factor=0.2,
                           delay_evaluation=6)
}
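
# A hedged sketch (not shown in this snippet) of how parameter_space and
# tuning_settings might feed a sweep configuration; training_dataset,
# compute_target, the task string and the primary metric are assumptions, and
# the sketch assumes AutoMLImageConfig accepts each key in tuning_settings.
image_config_sweep = AutoMLImageConfig(
    task='image-instance-segmentation',
    compute_target=compute_target,
    training_data=training_dataset,
    validation_data=validation_dataset,
    primary_metric='mean_average_precision',
    **tuning_settings)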
Example no. 3
                compute_target=gpu_compute_target,
                pip_packages=[
                    'keras', 'tensorflow', 'tensorflow-gpu', 'matplotlib',
                    'pillow', 'six', 'numpy', 'azureml-sdk', 'tqdm'
                ],
                conda_packages=['cudatoolkit=10.0.130'],
                entry_script='kd_squeezenet.py',
                use_gpu=True,
                node_count=1)

from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.pipeline.steps import HyperDriveStep
from azureml.train.hyperdrive import choice, loguniform, uniform

ps = RandomParameterSampling({
    '--learning_rate': uniform(1e-3, 2e-2),
    '--momentum': uniform(.1, .95),
    '--weight_decay': loguniform(-5, -3),
    '--temperature': uniform(1, 9),
    # '--lambda_const': uniform(.1, .3),
    '--transfer_learning': choice("True", "False")
})

policy = BanditPolicy(evaluation_interval=2,
                      slack_factor=0.1,
                      delay_evaluation=10)

hdc = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
Example no. 4
                    script_params=script_params,
                    compute_target=compute_target,
                    entry_script='pytorch_train.py',
                    use_gpu=True)

# Now that we've seen how to do a simple PyTorch training run using the SDK, let's see if we can further improve the accuracy of our model. We can optimize our model's hyperparameters using Azure Machine Learning's hyperparameter tuning capabilities.
# Start a hyperparameter sweep

# First, we will define the hyperparameter space to sweep over. Since our training script uses a learning rate schedule to decay the learning rate every few epochs, let's tune the initial learning rate and momentum. In this example we will use random sampling to try different hyperparameter configurations and maximize our primary metric, the best validation accuracy (best_val_acc).

# Then, we specify the early termination policy used to terminate poorly performing runs early. Here we use the BanditPolicy, which terminates any run whose primary metric falls outside the slack factor of the best-performing run. In this tutorial, the policy is applied every epoch (since we report our best_val_acc metric every epoch and evaluation_interval=1), and the first policy evaluation is delayed until after the first 10 epochs (delay_evaluation=10). Refer to the Azure ML documentation for more information on the BanditPolicy and other available policies.

from azureml.train.hyperdrive import RandomParameterSampling, HyperDriveRunConfig, BanditPolicy, PrimaryMetricGoal, uniform

param_sampling = RandomParameterSampling( {
        'learning_rate': uniform(0.0005, 0.005),
        'momentum': uniform(0.9, 0.99)
    }
)

early_termination_policy = BanditPolicy(slack_factor=0.15, evaluation_interval=1, delay_evaluation=10)

hyperdrive_run_config = HyperDriveRunConfig(estimator=estimator,
                                            hyperparameter_sampling=param_sampling, 
                                            policy=early_termination_policy,
                                            primary_metric_name='best_val_acc',
                                            primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                            max_total_runs=8,
                                            max_concurrent_runs=4)

# Finally, launch the hyperparameter tuning job.
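
# A minimal sketch of the submission step (cut off in this snippet), assuming an
# Experiment object named `experiment` was created earlier in the notebook.
hyperdrive_run = experiment.submit(hyperdrive_run_config)
hyperdrive_run.wait_for_completion(show_output=True)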
Example no. 5
def build_prednet_pipeline(dataset, ws):
    print("building pipeline for dataset %s in workspace %s" %
          (dataset, ws.name))

    base_dir = "."

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = "./scripts"
    os.makedirs(script_folder)

    shutil.copytree(os.path.join(base_dir, "models"),
                    os.path.join(base_dir, script_folder, "models"))
    shutil.copy(os.path.join(base_dir, "train.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "data_preparation.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "register_prednet.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "batch_scoring.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "train_clf.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "register_clf.py"), script_folder)

    cpu_compute_name = args.cpu_compute_name
    cpu_compute_target = AmlCompute(ws, cpu_compute_name)
    print("found existing compute target: %s" % cpu_compute_name)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = args.gpu_compute_name

    gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
    print(gpu_compute_target.get_status().serialize())

    env = Environment.get(ws, "prednet")

    # Runconfigs
    runconfig = RunConfiguration()
    runconfig.environment = env
    print("PipelineData object created")

    # DataReference to where raw data is stored.
    raw_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="raw_data",
        path_on_datastore=os.path.join("prednet", "data", "raw_data"),
    )
    print("DataReference object created")

    # Name the intermediate data preprocessed_data and assign it to the
    # variable preprocessed_data.
    preprocessed_data = PipelineData("preprocessed_data",
                                     datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    hd_child_cwd = PipelineData("prednet_model_path", datastore=def_blob_store)
    # prednet_path = PipelineData("outputs", datastore=def_blob_store)
    scored_data = PipelineData("scored_data", datastore=def_blob_store)
    model_path = PipelineData("model_path", datastore=def_blob_store)

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name="prepare_data",
        script_name="data_preparation.py",
        arguments=[
            "--raw_data",
            raw_data,
            "--preprocessed_data",
            preprocessed_data,
            "--dataset",
            dataset,
        ],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    # data_prep.run_after(video_decoding)

    print("data_prep step created")

    est = Estimator(
        source_directory=script_folder,
        compute_target=gpu_compute_target,
        entry_script="train.py",
        node_count=1,
        environment_definition=env,
    )

    ps = BayesianParameterSampling({
        "--batch_size": choice(1, 2, 4, 10),
        "--filter_sizes": choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        "--stack_sizes": choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),
        "--learning_rate": uniform(1e-6, 1e-3),
        "--lr_decay": uniform(1e-9, 1e-2),
        "--freeze_layers": choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2",
                                  "2, 3", "0", "3"),
        # "--fine_tuning": choice("True", "False"),
    })

    hdc = HyperDriveConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        primary_metric_name="val_loss",
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=3,
        max_concurrent_runs=3,
        max_duration_minutes=60 * 6,
    )

    train_prednet = HyperDriveStep(
        "train_w_hyperdrive",
        hdc,
        estimator_entry_script_arguments=[
            "--preprocessed_data",
            preprocessed_data,
            "--remote_execution",
            "--dataset",
            dataset,
        ],
        inputs=[preprocessed_data],
        outputs=[hd_child_cwd],
        metrics_output=data_metrics,
        allow_reuse=True,
    )
    train_prednet.run_after(data_prep)

    register_prednet = PythonScriptStep(
        name="register_prednet",
        script_name="register_prednet.py",
        arguments=[
            "--data_metrics",
            data_metrics,
        ],
        compute_target=cpu_compute_target,
        inputs=[data_metrics, hd_child_cwd],
        source_directory=script_folder,
        allow_reuse=True,
    )
    register_prednet.run_after(train_prednet)

    batch_scoring = PythonScriptStep(
        name="batch_scoring",
        script_name="batch_scoring.py",
        arguments=[
            "--preprocessed_data",
            preprocessed_data,
            "--scored_data",
            scored_data,
            "--dataset",
            dataset,
            # "--prednet_path",
            # prednet_path
        ],
        compute_target=gpu_compute_target,
        inputs=[preprocessed_data],
        outputs=[scored_data],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    batch_scoring.run_after(register_prednet)

    train_clf = PythonScriptStep(
        name="train_clf",
        script_name="train_clf.py",
        arguments=[
            "--preprocessed_data", preprocessed_data, "--scored_data",
            scored_data, "--model_path", model_path
        ],
        compute_target=cpu_compute_target,
        inputs=[preprocessed_data, scored_data],
        outputs=[model_path],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    train_clf.run_after(batch_scoring)

    register_clf = PythonScriptStep(
        name="register_clf",
        script_name="register_clf.py",
        arguments=["--model_path", model_path],
        inputs=[model_path],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        allow_reuse=True,
        runconfig=runconfig,
    )
    register_clf.run_after(train_clf)

    pipeline = Pipeline(
        workspace=ws,
        steps=[
            data_prep,
            train_prednet,
            register_prednet,
            batch_scoring,
            train_clf,
            register_clf,
        ],
    )
    pipeline.validate()

    pipeline_name = "prednet_" + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    _ = Schedule.create(
        workspace=ws,
        name=pipeline_name + "_sch",
        pipeline_id=published_pipeline.id,
        experiment_name=pipeline_name,
        datastore=def_blob_store,
        wait_for_provisioning=True,
        description="Datastore scheduler for Pipeline " + pipeline_name,
        path_on_datastore=os.path.join("prednet/data/raw_data", dataset,
                                       "Train"),
        polling_interval=60 * 24,
    )

    published_pipeline.submit(ws, pipeline_name)
Example no. 6
print()
print('##################################################')
print('Batch AI run...')
print('##################################################')
print()

# start the job
run = sk_est.fit()
print(helpers.get_run_history_url(run))
run.wait_for_completion(show_output=True)

print('configure hyperdrive.')

# parameter space to sweep over
ps = RandomParameterSampling({"alpha": uniform(0.0, 1.0)})

# early termination policy
# check every 2 iterations and, if the primary metric falls outside the range
# of 10% of the best recorded run so far, terminate the run.
etp = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Hyperdrive run configuration
hrc = HyperDriveRunConfig(
    ".",
    estimator=sk_est,
    hyperparameter_sampling=ps,
    policy=etp,
    # metric to watch (for early termination)
    primary_metric_name='mse',
    # terminate if metric falls below threshold
Example no. 7
        print(f'HyperParams.Created. version = {VERSION}')

    def get_param_sampling(self):
        return RandomParameterSampling(PARAMETER_SAMPLING)

    def get_bandit_policy(self):
        return BanditPolicy(evaluation_interval=EVALUATION_INTERVAL,
                            slack_factor=SLACK_FACTOR)

    def get_hd_config(self, config):
        hd_config = HyperDriveConfig(
            run_config=config,
            hyperparameter_sampling=self.get_param_sampling(),
            policy=self.get_bandit_policy(),
            primary_metric_name=PRIMARY_METRIC_NAME,
            primary_metric_goal=PRIMARY_METRIC_GOAL,
            max_total_runs=MAX_TOTAL_RUNS,
            max_concurrent_runs=MAX_CONCURRENT_RUNS)
        return hd_config


# https://docs.microsoft.com/azure/machine-learning/how-to-tune-hyperparameters
VERSION = '2021.02.26'
PARAMETER_SAMPLING = {'--alpha': uniform(0.05, 1.0)}
PRIMARY_METRIC_NAME = 'r2'
PRIMARY_METRIC_GOAL = PrimaryMetricGoal.MAXIMIZE
EVALUATION_INTERVAL = 2
SLACK_FACTOR = 0.1
MAX_TOTAL_RUNS = 4
MAX_CONCURRENT_RUNS = 2
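
# A hypothetical usage sketch, not part of the original module: the class name
# HyperParams (inferred from the print above, assuming a no-argument constructor)
# and the ws, compute_target and env variables are assumptions standing in for
# objects created elsewhere.
from azureml.core import Experiment, ScriptRunConfig

script_config = ScriptRunConfig(source_directory='.',
                                script='train.py',
                                compute_target=compute_target,
                                environment=env)
hd_config = HyperParams().get_hd_config(script_config)
hyperdrive_run = Experiment(workspace=ws, name='hyperdrive').submit(hd_config)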
Example no. 8
                                    0.01, '--min_data_in_leaf', 50,
                                    '--lambda_l1', 20, '--lambda_l2', 20,
                                    '--n_estimators', 1000
                                ],
                                environment=sklearn_env,
                                compute_target=training_cluster)

# Sample a range of parameter values
params = BayesianParameterSampling({
    # Bayesian sampling picks new values based on how previous runs performed,
    # so it explores these ranges rather than trying every combination
    '--max_depth': choice(list(range(2, 20))),
    '--num_leaves': choice(list(range(6, 251))),
    '--subsample': uniform(0.5, 1),
    '--learning_rate': uniform(0.005, 0.25),
    '--min_data_in_leaf': choice(list(range(2, 501))),
    '--lambda_l1': choice(list(range(201))),
    '--lambda_l2': choice(list(range(201))),
    '--n_estimators': choice(list(range(100, 4001, 100)))
})

# Configure hyperdrive settings
hyperdrive = HyperDriveConfig(run_config=script_config,
                              hyperparameter_sampling=params,
Example no. 9
                    compute_target=compute_target,
                    entry_script='pytorch_train.py',
                    use_gpu=True)

# Now that we've seen how to do a simple PyTorch training run using the SDK, let's see if we can further improve the accuracy of our model. We can optimize our model's hyperparameters using Azure Machine Learning's hyperparameter tuning capabilities.
# Start a hyperparameter sweep

# First, we will define the hyperparameter space to sweep over. Since our training script uses a learning rate schedule to decay the learning rate every few epochs, let's tune the initial learning rate and momentum. In this example we will use random sampling to try different hyperparameter configurations and maximize our primary metric, the best validation accuracy (best_val_acc).

# Then, we specify the early termination policy used to terminate poorly performing runs early. Here we use the BanditPolicy, which terminates any run whose primary metric falls outside the slack factor of the best-performing run. In this tutorial, the policy is applied every epoch (since we report our best_val_acc metric every epoch and evaluation_interval=1), and the first policy evaluation is delayed until after the first 10 epochs (delay_evaluation=10). Refer to the Azure ML documentation for more information on the BanditPolicy and other available policies.

from azureml.train.hyperdrive import RandomParameterSampling, HyperDriveRunConfig, BanditPolicy, PrimaryMetricGoal, uniform

param_sampling = RandomParameterSampling({
    'learning_rate': uniform(0.0005, 0.005),
    'momentum': uniform(0.9, 0.99)
})

early_termination_policy = BanditPolicy(slack_factor=0.15,
                                        evaluation_interval=1,
                                        delay_evaluation=10)

hyperdrive_run_config = HyperDriveRunConfig(
    estimator=estimator,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name='best_val_acc',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=8,
Example no. 10
experiment_hd = Experiment(workspace=ws, name='hyperdrive')

#%% [markdown]

###### Create Random Parameter Sampler

#%%

# Parameter space to sweep over - uses Random Parameter Sampling
ps = hd.RandomParameterSampling({
    '--network-name': hd.choice('densenet201', 'resnet152', 'resnet34',
                                'alexnet', 'vgg19_bn'),
    '--minibatch-size': hd.choice(8, 16),
    '--learning-rate': hd.uniform(0.00001, 0.001),
    # How often should the learning rate decay
    '--step-size': hd.choice(10, 25, 50),
    # The decay applied to the learning rate every {step-size} steps
    '--gamma': hd.uniform(0.7, 0.99),
    '--optimizer-type': hd.choice('sgd', 'adam')
})

#%% [markdown]

###### Create Early Termination Policy

#%%
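
# The snippet is cut off at this cell; a hedged sketch of what the early
# termination policy might look like, with illustrative values, reusing the
# same `hd` alias as above.
etp = hd.BanditPolicy(slack_factor=0.1,
                      evaluation_interval=1,
                      delay_evaluation=10)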
Example no. 11
from azureml.train.hyperdrive import RandomParameterSampling, choice, normal

param_space = {
                 '--batch_size': choice(16, 32, 64),
                 '--learning_rate': normal(10, 3)
              }

param_sampling = RandomParameterSampling(param_space)


#Bayesian sampling
from azureml.train.hyperdrive import BayesianParameterSampling, choice, uniform

param_space = {
                 '--batch_size': choice(16, 32, 64),
                 '--learning_rate': uniform(0.05, 0.1)
              }

param_sampling = BayesianParameterSampling(param_space)


#bandit policy
from azureml.train.hyperdrive import BanditPolicy

early_termination_policy = BanditPolicy(slack_amount = 0.2,
                                        evaluation_interval=1,
                                        delay_evaluation=5)


#median stopping
from azureml.train.hyperdrive import MedianStoppingPolicy
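
# A minimal sketch of instantiating the policy just imported; the interval
# values shown here are illustrative, not from the original snippet.
early_termination_policy = MedianStoppingPolicy(evaluation_interval=1,
                                                delay_evaluation=5)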