Example #1
    def create_hyperdrive_trainer(self, estimator, hd_dict, search_type,
                                  metric_name, maximize_metric,
                                  early_term_policy, max_total_runs,
                                  max_concurrent_runs, max_minutes):

        from azureml.train.hyperdrive import RandomParameterSampling, GridParameterSampling, BayesianParameterSampling

        if search_type == "random":
            ps = RandomParameterSampling(hd_dict)
        elif search_type == "grid":
            ps = GridParameterSampling(hd_dict)
        elif search_type == "bayesian":
            ps = BayesianParameterSampling(hd_dict)
        else:
            errors.config_error(
                "Azure ML Hyperdrive search_type not supported: " +
                search_type)

        max_concurrent_runs = min(max_total_runs, max_concurrent_runs)

        from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal

        trainer = HyperDriveConfig(
            estimator=estimator,
            hyperparameter_sampling=ps,
            policy=early_term_policy,
            primary_metric_name=metric_name,
            primary_metric_goal=PrimaryMetricGoal.MAXIMIZE
            if maximize_metric else PrimaryMetricGoal.MINIMIZE,
            max_total_runs=max_total_runs,
            max_concurrent_runs=max_concurrent_runs,
            max_duration_minutes=max_minutes)

        return trainer
    def get_parameter_search_hyperdrive_config(
            self, estimator: Estimator) -> HyperDriveConfig:
        """
        Specify an Azure HyperDrive configuration.
        Further details are described in the tutorial
        https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters
        A reference is provided at https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive?view=azure-ml-py
        :param estimator: The estimator (configured PyTorch environment) of the experiment.
        :return: An Azure HyperDrive run configuration (configured PyTorch environment).
        """
        parameter_space = {'l_rate': uniform(0.0005, 0.01)}

        param_sampling = RandomParameterSampling(parameter_space)

        # early terminate poorly performing runs
        early_termination_policy = BanditPolicy(slack_factor=0.15,
                                                evaluation_interval=1,
                                                delay_evaluation=10)

        return HyperDriveConfig(
            estimator=estimator,
            hyperparameter_sampling=param_sampling,
            policy=early_termination_policy,
            primary_metric_name=TrackedMetrics.Val_Loss.value,
            primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
            max_total_runs=10,
            max_concurrent_runs=2)
Example #3
def _create_dummy_hyperdrive_param_search_config(
        estimator: Estimator) -> HyperDriveConfig:
    return HyperDriveConfig(estimator=estimator,
                            hyperparameter_sampling=RandomParameterSampling(
                                {'l_rate': uniform(0.0005, 0.01)}),
                            primary_metric_name=TrackedMetrics.Val_Loss.value,
                            primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                            max_total_runs=HYPERDRIVE_TOTAL_RUNS)
Example #4
 def get_hd_config(self, config):
     hd_config = HyperDriveConfig(
         run_config=config,
         hyperparameter_sampling=self.get_param_sampling(),
         policy=self.get_bandit_policy(),
         primary_metric_name=PRIMARY_METRIC_NAME,
         primary_metric_goal=PRIMARY_METRIC_GOAL,
         max_total_runs=MAX_TOTAL_RUNS,
         max_concurrent_runs=MAX_CONCURRENT_RUNS)
     return hd_config
Example #5
def main(epochs, iterations, compute_target, concurrent_runs):
    cli_auth = AzureCliAuthentication()

    experiment = Experiment.from_directory(".", auth=cli_auth)
    ws = experiment.workspace

    cluster = ws.compute_targets[compute_target]
    food_data = ws.datastores['food_images']

    script_arguments = {"--data-dir": food_data.as_mount(), "--epochs": epochs}

    tf_est = TensorFlow(source_directory=".",
                        entry_script='code/train/train.py',
                        script_params=script_arguments,
                        compute_target=cluster,
                        conda_packages=['pillow', 'pandas'],
                        pip_packages=['click', 'seaborn'],
                        use_docker=True,
                        use_gpu=True,
                        framework_version='1.13')

    # Run on subset of food categories
    tf_est.run_config.arguments.extend(
        ['apple_pie', 'baby_back_ribs', 'baklava', 'beef_carpaccio'])

    param_sampler = RandomParameterSampling({
        '--minibatch-size':
        choice(16, 32, 64),
        '--learning-rate':
        loguniform(-9, -6),
        '--optimizer':
        choice('rmsprop', 'adagrad', 'adam')
    })

    # Create Early Termination Policy
    etpolicy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

    # Create HyperDrive Run Configuration
    hyper_drive_config = HyperDriveConfig(
        estimator=tf_est,
        hyperparameter_sampling=param_sampler,
        policy=etpolicy,
        primary_metric_name='acc',
        primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
        max_total_runs=iterations,
        max_concurrent_runs=concurrent_runs)

    # Submit the Hyperdrive Run
    print("Submitting Hyperdrive Run")
    hd_run = experiment.submit(hyper_drive_config)
    hd_run.wait_for_completion(raise_on_error=True, show_output=True)
    print("Finishing Run")
    best_run = hd_run.get_best_run_by_primary_metric()
    print(f'##vso[task.setvariable variable=run_id]{best_run.id}')
 def get_cross_validation_hyperdrive_config(self, run_config: ScriptRunConfig) -> HyperDriveConfig:
     """
     Returns a configuration for AzureML Hyperdrive that varies the cross validation split index.
     :param run_config: The AzureML run configuration object for training an individual model.
     :return: A hyperdrive configuration object.
     """
     return HyperDriveConfig(
         run_config=run_config,
         hyperparameter_sampling=self.get_cross_validation_hyperdrive_sampler(),
         primary_metric_name=TrackedMetrics.Val_Loss.value,
         primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
         max_total_runs=self.get_total_number_of_cross_validation_runs()
     )
 def get_cross_validation_hyperdrive_config(
         self, estimator: Estimator) -> HyperDriveConfig:
     """
     Returns a configuration for AzureML Hyperdrive that varies the cross validation split index.
     :param estimator: The AzureML estimator object that runs model training.
     :return: A hyperdrive configuration object.
     """
     return HyperDriveConfig(
         estimator=estimator,
         hyperparameter_sampling=self.get_cross_validation_hyperdrive_sampler(),
         primary_metric_name=TrackedMetrics.Val_Loss.value,
         primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
         max_total_runs=self.get_total_number_of_cross_validation_runs())
 def get_cross_validation_hyperdrive_config(
         self, run_config: ScriptRunConfig) -> HyperDriveConfig:
     """
     Returns a configuration for AzureML Hyperdrive that varies the cross validation split index.
     Because this adds a val/Loss metric, it is important that, when subclassing LightningContainer,
     your implementation of LightningModule logs val/Loss. There is an example of this in
     HelloRegression's validation_step method; a minimal sketch is also shown after this function.
     :param run_config: The AzureML run configuration object for training an individual model.
     :return: A hyperdrive configuration object.
     """
     return HyperDriveConfig(
         run_config=run_config,
         hyperparameter_sampling=GridParameterSampling(
             parameter_space={
                 CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY:
                 choice(list(range(self.number_of_cross_validation_splits)))
             }),
         primary_metric_name=TrackedMetrics.Val_Loss.value,
         primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
         max_total_runs=self.number_of_cross_validation_splits)
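
The docstring above requires the LightningModule to log a val/Loss metric so that HyperDrive can track it as the primary metric. Below is a minimal sketch of such a validation_step, assuming PyTorch Lightning and assuming the tracked metric name is the literal string "val/Loss"; it is not the actual HelloRegression implementation.

import torch
import pytorch_lightning as pl


class DummyRegressionModule(pl.LightningModule):
    """Minimal module whose only purpose is to show where the tracked metric gets logged."""

    def __init__(self):
        super().__init__()
        self.model = torch.nn.Linear(1, 1)

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self(x), y)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        loss = torch.nn.functional.mse_loss(self(x), y)
        # Log under the same name that the HyperDriveConfig above uses as primary_metric_name
        # (assumed here to be "val/Loss").
        self.log("val/Loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=1e-2)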
Example #9
# Create an estimator that uses the remote compute
hyper_estimator = SKLearn(
    source_directory=experiment_folder,
    inputs=[diabetes_ds.as_named_input('diabetes')],  # Pass the dataset as an input
    compute_target=gpu_cluster,
    conda_packages=['pandas', 'ipykernel', 'matplotlib'],
    pip_packages=['azureml-sdk', 'argparse', 'pyarrow'],
    entry_script='diabetes_training.py')

# Configure hyperdrive settings
hyperdrive = HyperDriveConfig(estimator=hyper_estimator,
                              hyperparameter_sampling=params,
                              policy=policy,
                              primary_metric_name='AUC',
                              primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                              max_total_runs=6,
                              max_concurrent_runs=4)

# Run the experiment
run = experiment.submit(config=hyperdrive)

# Get the best run
best_run = run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['arguments']

print('Best Run Id: ', best_run.id)
print(' -AUC:', best_run_metrics['AUC'])
print(' -Accuracy:', best_run_metrics['Accuracy'])
Example #10
    parameter_sampling = utils.get_parameter_sampling(
        experiment_settings["hyperparameter_sampling"]["method"],
        experiment_settings["hyperparameter_sampling"]["parameters"])
    policy = utils.get_policy(
        experiment_settings["hyperparameter_sampling"]["policy"])
    primary_metric_goal = (
        PrimaryMetricGoal.MAXIMIZE
        if "max" in experiment_settings["hyperparameter_sampling"]["primary_metric_goal"]
        else PrimaryMetricGoal.MINIMIZE)

    run_config = HyperDriveConfig(
        estimator=estimator,
        hyperparameter_sampling=parameter_sampling,
        policy=policy,
        primary_metric_name=experiment_settings["hyperparameter_sampling"]
        ["primary_metric_name"],
        primary_metric_goal=primary_metric_goal,
        max_total_runs=experiment_settings["hyperparameter_sampling"]
        ["max_total_runs"],
        max_concurrent_runs=experiment_settings["hyperparameter_sampling"]
        ["max_concurrent_runs"],
        max_duration_minutes=experiment_settings["hyperparameter_sampling"]
        ["max_duration_minutes"])
else:
    run_config = estimator

# Submitting an Experiment and creating a Run
print("Submitting an experiment and creating a run")
run = exp.submit(run_config, tags=experiment_settings["run_tags"])

# Shows output of the run on stdout
run.wait_for_completion(show_output=True, wait_post_processing=True)
Example #11
        hyperdrive.loguniform(
            convert_base(1e-6),
            convert_base(5e-2)),  # NB. loguniform on [exp(min), exp(max)]
        "--weight_decay":
        hyperdrive.uniform(5e-3, 15e-2),
        "--per_device_train_batch_size":
        hyperdrive.choice([16, 32]),
    }

    hyperparameter_sampling = RandomParameterSampling(search_space)

    policy = TruncationSelectionPolicy(truncation_percentage=50,
                                       evaluation_interval=2,
                                       delay_evaluation=0)

    hyperdrive_config = HyperDriveConfig(
        run_config=config,
        hyperparameter_sampling=hyperparameter_sampling,
        policy=policy,
        primary_metric_name="eval_matthews_correlation",
        primary_metric_goal=hyperdrive.PrimaryMetricGoal.MAXIMIZE,
        max_total_runs=20,
        max_concurrent_runs=8,
    )

    run = Experiment(
        ws,
        "transformers-glue-finetuning-hyperdrive").submit(hyperdrive_config)
    print(run.get_portal_url())
    run.wait_for_completion(show_output=True)
Example #12
def build_pipeline(dataset, ws, config):
    print("building pipeline for dataset %s in workspace %s" % (dataset, ws.name))

    base_dir = '.'
        
    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)
    
    shutil.copy(os.path.join(base_dir, 'video_decoding.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_submit.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_create.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'train.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'prednet.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'keras_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_preparation.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'model_registration.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'config.json'), script_folder)
    
    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except Exception:  # ComputeTargetException
        print("creating new compute target")
        
        provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', 
                                                                    max_nodes=4,
                                                                    idle_seconds_before_scaledown=1800)    
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name, provisioning_config)
        cpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
        
    # use get_status() to get a detailed status for the current cluster. 
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = config['gpu_compute']

    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except Exception:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', 
                                                                    max_nodes=10,
                                                                    idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name, provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout. 
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster. 
    try:
        print(gpu_compute_target.get_status().serialize())
    except BaseException as e:
        print("Could not get status of compute target.")
        print(e)

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(
        conda_packages=["py-opencv=3.4.2"],
        pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D',
        pip_packages=["azure-storage-blob==1.5.0", "hickle==3.4.3", "requests==2.21.0",
                      "sklearn", "pandas==0.24.2", "azureml-sdk", "numpy==1.16.2",
                      "pillow==6.0.0"])
    
    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    # DataReference to where video data is stored.
    video_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="video_data",
        path_on_datastore=os.path.join("prednet", "data", "video", dataset))
    print("DataReference object created")
        
    # Intermediate PipelineData objects passed between the pipeline steps.
    raw_data = PipelineData("raw_video_frames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames", datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py", 
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target, 
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    print("video_decode step created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name='prepare_data',
        script_name="data_preparation.py", 
        arguments=["--input_data", raw_data, "--output_data", preprocessed_data],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target, 
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    data_prep.run_after(video_decoding)

    print("data_prep step created")


    # configure access to ACR for pulling our custom docker image
    acr = ContainerRegistry()
    acr.address = config['acr_address']
    acr.username = config['acr_username']
    acr.password = config['acr_password']
    
    est = Estimator(source_directory=script_folder,
                    compute_target=gpu_compute_target,
                    entry_script='train.py', 
                    use_gpu=True,
                    node_count=1,
                    custom_docker_image = "wopauli_1.8-gpu:1",
                    image_registry_details=acr,
                    user_managed=True
                    )

    ps = RandomParameterSampling(
        {
            '--batch_size': choice(1, 2, 4, 8),
            '--filter_sizes': choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
            '--stack_sizes': choice("48, 96, 192", "36, 72, 144", "12, 24, 48"), #, "48, 96"),
            '--learning_rate': loguniform(-6, -1),
            '--lr_decay': loguniform(-9, -1),
            '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
            '--transfer_learning': choice("True", "False")
        }
    )

    policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1, delay_evaluation=10)

    hdc = HyperDriveConfig(estimator=est, 
                            hyperparameter_sampling=ps, 
                            policy=policy, 
                            primary_metric_name='val_loss', 
                            primary_metric_goal=PrimaryMetricGoal.MINIMIZE, 
                            max_total_runs=10,
                            max_concurrent_runs=5, 
                            max_duration_minutes=60*6
                            )

    hd_step = HyperDriveStep(
        name="train_w_hyperdrive",
        hyperdrive_run_config=hdc,
        estimator_entry_script_arguments=[
            '--data-folder', preprocessed_data, 
            '--remote_execution',
            '--dataset', dataset
            ],
        inputs=[preprocessed_data],
        metrics_output = data_metrics,
        allow_reuse=True
    )
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=cpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.']
    )
    registration_step.run_after(hd_step)

    pipeline = Pipeline(workspace=ws, steps=[video_decoding, data_prep, hd_step, registration_step])
    print ("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete") 

    pipeline_name = 'prednet_' + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)
    

    schedule = Schedule.create(workspace=ws, name=pipeline_name + "_sch",
                            pipeline_id=published_pipeline.id, 
                            experiment_name=pipeline_name,
                            datastore=def_blob_store,
                            wait_for_provisioning=True,
                            description="Datastore scheduler for Pipeline" + pipeline_name,
                            path_on_datastore=os.path.join('prednet/data/video', dataset, 'Train'),
                            polling_interval=1
                            )

    return pipeline_name
Example #13
    uniform(0.5, 1),
    '--learning_rate':
    uniform(0.005, 0.25),
    '--min_data_in_leaf':
    choice(list(range(2, 501))),
    '--lambda_l1':
    choice(list(range(201))),
    '--lambda_l2':
    choice(list(range(201))),
    '--n_estimators':
    choice(list(range(100, 4001, 100)))
})

# Configure hyperdrive settings
hyperdrive = HyperDriveConfig(run_config=script_config,
                              hyperparameter_sampling=params,
                              policy=None,
                              primary_metric_name='rmse',
                              primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                              max_total_runs=160,
                              max_concurrent_runs=4)

# Run the experiment
experiment = Experiment(workspace=ws, name='training_hyperdrive03')
run = experiment.submit(config=hyperdrive)
print("Experiment is running...")

# Show the status in the notebook as the experiment runs
# RunDetails(run).show()
run.wait_for_completion()
print("Experiment has done.")
Example #14
src = ScriptRunConfig(source_directory="./topicmodel",
                      script='train.py',
                      arguments=args,
                      compute_target=compute_target,
                      environment=env)

param_sampling = RandomParameterSampling({
    "--num-topics": choice(5, 10, 15, 20)
})

# Submit experiment

hd = HyperDriveConfig(run_config=src,
                      hyperparameter_sampling=param_sampling,
                      primary_metric_name="c_v",
                      primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                      max_total_runs=100,
                      max_concurrent_runs=4)

run = exp.submit(config=hd)

run.wait_for_completion(show_output=False)

print(run.get_metrics())

print(run.get_file_names())

# Register model

best_run = run.get_best_run_by_primary_metric()
Example #15
    '--learning_rate': uniform(1e-3, 2e-2),
    '--momentum': uniform(.1, .95),
    '--weight_decay': loguniform(-5, -3),
    '--temperature': uniform(1, 9),
    # '--lambda_const': uniform(.1, .3),
    '--transfer_learning': choice("True", "False")
})

policy = BanditPolicy(evaluation_interval=2,
                      slack_factor=0.1,
                      delay_evaluation=10)

hdc = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='val_loss',
    primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
    max_total_runs=5,  #100,
    max_concurrent_runs=5)

hd_step = HyperDriveStep(
    name="train_w_hyperdrive",
    hyperdrive_config=hdc,
    estimator_entry_script_arguments=[
        '--data-folder', labeled_data, '--logits-folder', logits_data,
        '--remote_execution'
    ],
    # estimator_entry_script_arguments=script_params,
    inputs=[labeled_data, logits_data],
    metrics_output=data_metrics,
    allow_reuse=True)
    framework_version='2.0',
    use_gpu=True,
    pip_packages=[
        'transformers==2.0.0', 'azureml-dataprep[fuse,pandas]==1.1.29'
    ])

# %% [markdown]
# Finally, we add all our parameters in a [HyperDriveConfig](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive.hyperdriveconfig?view=azure-ml-py) class and submit it as a run.

# %%
from azureml.train.hyperdrive import HyperDriveConfig

hyperdrive_run_config = HyperDriveConfig(
    estimator=estimator4,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name=primary_metric_name,
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=10,
    max_concurrent_runs=2)

run4 = experiment.submit(hyperdrive_run_config)

# %% [markdown]
# When we view the details of our run this time, we will see information and metrics for every run in our hyperparameter tuning.

# %%
from azureml.widgets import RunDetails
RunDetails(run4).show()

# %% [markdown]
# We can retrieve the best run based on our defined metric.
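
# %% [markdown]
# A minimal sketch of doing so is shown below (assuming the HyperDrive run above has completed;
# `run4` and `primary_metric_name` are the objects defined in the earlier cells).

# %%
best_run4 = run4.get_best_run_by_primary_metric()
best_run4_metrics = best_run4.get_metrics()
print("Best run id:", best_run4.id)
print("Best", primary_metric_name, ":", best_run4_metrics.get(primary_metric_name))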
Example #17
        script_params=script_params_3,
        source_directory=os.path.dirname(os.path.realpath(__file__)),
        compute_target=workspace.compute_targets["alwaysoncluster"],
        distributed_training=MpiConfiguration(),
        framework_version='1.4',
        use_gpu=True,
        pip_packages=[
            'numpy==1.15.4', 'pandas==0.23.4', 'scikit-learn==0.20.1',
            'scipy==1.0.0', 'matplotlib==3.0.2', 'utils==0.9.0',
            'onnxruntime==1.2.0', 'onnx==1.6.0'
        ])

    experiment = Experiment(workspace=workspace, name="deeplearning")
    run = experiment.submit(estimator)

    if hyperdrive is True:
        # Define multi-run configuration
        hyperdrive_run_config = HyperDriveConfig(
            estimator=estimator,
            hyperparameter_sampling=param_sampling,
            policy=None,
            primary_metric_name="accuracy",
            primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
            max_total_runs=10,
            max_concurrent_runs=None)

        # Define the ML experiment
        experiment = Experiment(workspace=workspace, name="hyperdrive")
        # Submit the experiment
        run = experiment.submit(hyperdrive_run_config)
Example #18
     '--model_type':
     model_type,
     '--max_seq_len':
     choice(128, 256),
     '--embeds_dropout':
     choice(0.1, 0.2, 0.3)
 })
 ## Termination policy
 early_termination_policy = BanditPolicy(slack_factor=0.1,
                                         evaluation_interval=1,
                                         delay_evaluation=3)
 ## Prepare HyperDrive Config
 hdc = HyperDriveConfig(
     estimator=est,
     hyperparameter_sampling=param_sampling,
     policy=early_termination_policy,
     primary_metric_name='f1macro',
     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
     max_total_runs=40,
     max_concurrent_runs=4)
 ## Run hyperparameter tuning
 hyperdrive_run = exp.submit(config=hdc)
 if args.update_model:
     hyperdrive_run.wait_for_completion(show_output=True)
     ## Get Results
     best_run = hyperdrive_run.get_best_run_by_primary_metric()
     ## Experiment
     experiment_name = args.project_name + "-train"
     exp = Experiment(workspace=ws, name=experiment_name)
     # Parameters determined by the hyperparameter search
     script_params_hyper = {
         '--learning_rate':
Example #19
from azureml.core import Workspace, Experiment
from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal

ws = Workspace.from_config()

hyperdrive = HyperDriveConfig(run_config=script_config,
                              hyperparameter_sampling=param_sampling,
                              policy=None,
                              primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                              primary_metric_name='Accuracy',
                              max_total_runs=6,
                              max_concurrent_runs=4)

experiment = Experiment(ws, name="hyper_training")
hyperdrive_run = experiment.submit(config=hyperdrive)

# monitoring the child runs

for child_run in hyperdrive_run.get_children():
    print(child_run.id, child_run.get_metrics())

for child_run in hyperdrive_run.get_children_sorted_by_primary_metric():
    print(child_run)

best_run = hyperdrive_run.get_best_run_by_primary_metric()
Example #20
        entry_script=PathsConfig.entry_script,
        use_gpu=True,
        custom_docker_image=settings["IMAGE_NAME"],
    )
    if GeneralConfig.hyperdrive:
        if GeneralConfig.architecture_type == "PretrainedResNet50":
            hyperparams_space = HyperdriveConfig.pretrained_resnet50_hyperparams_space
        else:
            raise NotImplementedError
        hyperparams_space_format = {
            parameter: choice(parameter_range)
            for parameter, parameter_range in hyperparams_space.items()
        }
        parameters_sampling = RandomParameterSampling(hyperparams_space_format)
        policy = BanditPolicy(
            evaluation_interval=HyperdriveConfig.evaluation_interval,
            slack_factor=HyperdriveConfig.slack_factor,
        )
        hdc = HyperDriveConfig(
            estimator=est,
            hyperparameter_sampling=parameters_sampling,
            policy=policy,
            primary_metric_name="Accuracy",
            primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
            max_total_runs=HyperdriveConfig.max_total_runs,
            max_concurrent_runs=HyperdriveConfig.max_concurrent_runs,
        )
        run = exp.submit(hdc)
    else:
        run = exp.submit(est)
Example #21
File: tf2train.py  Project: mindis/mlops
                 script_params={
                     '--data-folder':
                     dataset.as_named_input('mnist').as_mount()
                 },
                 compute_target=compute_target,
                 entry_script='tf_mnist2.py',
                 framework_version='2.0',
                 use_gpu=True,
                 pip_packages=['azureml-dataprep[pandas,fuse]'])

policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

htc = HyperDriveConfig(estimator=est,
                       hyperparameter_sampling=ps,
                       policy=policy,
                       primary_metric_name='validation_acc',
                       primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                       max_total_runs=8,
                       max_concurrent_runs=4)

htr = exp.submit(config=htc)

#RunDetails(htr).show()

htr.wait_for_completion(show_output=True)

assert (htr.get_status() == "Completed")

best_run = htr.get_best_run_by_primary_metric()

print(best_run.get_file_names())
                             compute_target=cluster
                            )

# creating hyperparameters

from azureml.train.hyperdrive import GridParameterSampling, choice

hyper_params = GridParameterSampling({
    '--n_estimators': choice(10,20,30,100),
    '--min_samples_leaf': choice(1,2,5)
})


# configuring the HyperDriveConfig
from azureml.train.hyperdrive import HyperDriveConfig,PrimaryMetricGoal

hyper_config = HyperDriveConfig(run_config=script_run,
                                hyperparameter_sampling=hyper_params,
                                policy=None,
                                primary_metric_name='accuracy',
                                primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                max_total_runs=20,
                                max_concurrent_runs=2)


exp = Experiment(ws, "My_hyperdrive_exp")

new_run = exp.submit(hyper_config)

new_run.wait_for_completion(show_output = True)
Example #23
def create_experiment_config(workspace):
    ########################################
    ### Creating data prep Pipeline Step ###
    ########################################

    # Load settings
    print("Loading settings")
    data_prep_step_path = os.path.join("steps", "data_prep")
    with open(os.path.join(data_prep_step_path, "step.json")) as f:
        data_prep_settings = json.load(f)

    # Setup datasets of first step
    print("Setting up datasets")
    data_prep_input = Dataset.get_by_name(workspace=workspace,
                                          name=data_prep_settings.get(
                                              "dataset_input_name",
                                              None)).as_named_input(
                                                  data_prep_settings.get(
                                                      "dataset_input_name",
                                                      None)).as_mount()
    data_prep_output = PipelineData(
        name=data_prep_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=data_prep_settings.get(
                                "datastore_output_name",
                                "workspaceblobstore")),
        output_mode="mount").as_dataset()
    # Uncomment next lines, if you want to register intermediate dataset
    #data_prep_output.register(
    #    name=data_prep_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    data_prep_dependencies = CondaDependencies.create(
        pip_packages=data_prep_settings.get("pip_packages", []),
        conda_packages=data_prep_settings.get("conda_packages", []),
        python_version=data_prep_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    data_prep_run_config = RunConfiguration(
        conda_dependencies=data_prep_dependencies,
        framework=data_prep_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    data_prep_compute_target = ComputeTarget(workspace=workspace,
                                             name=data_prep_settings.get(
                                                 "compute_target_name", None))

    # Create python step
    print("Creating Step")
    data_prep = PythonScriptStep(
        name=data_prep_settings.get("step_name", None),
        script_name=data_prep_settings.get("script_name", None),
        arguments=data_prep_settings.get("arguments", []),
        compute_target=data_prep_compute_target,
        runconfig=data_prep_run_config,
        inputs=[data_prep_input],
        outputs=[data_prep_output],
        params=data_prep_settings.get("parameters", []),
        source_directory=data_prep_step_path,
        allow_reuse=data_prep_settings.get("allow_reuse", True),
        version=data_prep_settings.get("version", None),
    )

    ###############################################
    ### Creating data model train Pipeline Step ###
    ###############################################

    # Load settings
    print("Loading settings")
    model_train_step_path = os.path.join("steps", "model_train")
    with open(os.path.join(model_train_step_path, "step.json")) as f:
        model_train_settings = json.load(f)
    hyperparameter_sampling_settings = model_train_settings.get(
        "hyperparameter_sampling", {})

    # Setup datasets of first step
    print("Setting up datasets")
    model_train_input = data_prep_output.as_named_input(
        name=model_train_settings.get("dataset_input_name", None))
    model_train_output = PipelineData(
        name=model_train_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=model_train_settings.get(
                                "datastore_output_name", None)),
        output_mode="mount",
    ).as_dataset()
    # Uncomment next lines, if you want to register intermediate dataset
    #model_train_output.register(
    #    name=model_train_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    model_train_dependencies = CondaDependencies.create(
        pip_packages=model_train_settings.get("pip_packages", []),
        conda_packages=model_train_settings.get("conda_packages", []),
        python_version=model_train_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    model_train_run_config = RunConfiguration(
        conda_dependencies=model_train_dependencies,
        framework=model_train_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    model_train_compute_target = ComputeTarget(workspace=workspace,
                                               name=model_train_settings.get(
                                                   "compute_target_name",
                                                   None))

    # Create distributed training backend
    print("Creating distributed training backend")
    distributed_training_backend = get_distributed_backend(
        backend_name=model_train_settings.get("distributed_backend", None))

    # Create Estimator for Training
    print("Creating Estimator for training")
    model_train_estimator = Estimator(
        source_directory=model_train_step_path,
        entry_script=model_train_settings.get("script_name", None),
        environment_variables=model_train_settings.get("parameters", None),
        compute_target=model_train_compute_target,
        node_count=model_train_settings.get("node_count", None),
        distributed_training=distributed_training_backend,
        conda_packages=model_train_settings.get("conda_packages", None),
        pip_packages=model_train_settings.get("pip_packages", None),
    )

    try:
        # Create parameter sampling
        print("Creating Parameter Sampling")
        parameter_dict = {}
        parameters = hyperparameter_sampling_settings.get("parameters", {})
        for parameter_name, parameter_details in parameters.items():
            parameter_distr = get_parameter_distribution(
                distribution=parameter_details.get("distribution", None),
                **parameter_details.get("settings", {}))
            parameter_dict[f"--{parameter_name}"] = parameter_distr
        model_train_ps = get_parameter_sampling(
            sampling_method=hyperparameter_sampling_settings.get(
                "method", None),
            parameter_dict=parameter_dict)

        # Get Policy definition
        policy_settings = hyperparameter_sampling_settings.get("policy", {})
        kwargs = {
            key: value
            for key, value in policy_settings.items() if key not in
            ["policy_method", "evaluation_interval", "delay_evaluation"]
        }

        # Create termination policy
        print("Creating early termination policy")
        model_train_policy = get_policy(
            policy_method=policy_settings.get("method", ""),
            evaluation_interval=policy_settings.get("evaluation_interval",
                                                    None),
            delay_evaluation=policy_settings.get("delay_evaluation", None),
            **kwargs)

        # Create HyperDriveConfig
        print("Creating HyperDriveConfig")
        model_train_hyperdrive_config = HyperDriveConfig(
            estimator=model_train_estimator,
            hyperparameter_sampling=model_train_ps,
            policy=model_train_policy,
            primary_metric_name=hyperparameter_sampling_settings.get(
                "primary_metric", None),
            primary_metric_goal=PrimaryMetricGoal.MINIMIZE
            if "min" in hyperparameter_sampling_settings.get(
                "primary_metric_goal", None) else PrimaryMetricGoal.MAXIMIZE,
            max_total_runs=hyperparameter_sampling_settings.get(
                "max_total_runs", 1),
            max_concurrent_runs=hyperparameter_sampling_settings.get(
                "max_concurrent_runs", 1),
            max_duration_minutes=hyperparameter_sampling_settings.get(
                "max_duration_minutes", None))

        # Create HyperDriveStep
        print("Creating HyperDriveStep")
        model_train = HyperDriveStep(
            name=model_train_settings.get("step_name", None),
            hyperdrive_config=model_train_hyperdrive_config,
            estimator_entry_script_arguments=model_train_settings.get(
                "arguments", None),
            inputs=[model_train_input],
            outputs=[model_train_output],
            allow_reuse=model_train_settings.get("allow_reuse", True),
            version=model_train_settings.get("version", True))
    except Exception:
        print("Not all required parameters specified for HyperDrive step")

        # Create EstimatorStep
        print("Creating EstimatorStep")
        model_train = EstimatorStep(
            name=model_train_settings.get("step_name", None),
            estimator=model_train_estimator,
            estimator_entry_script_arguments=model_train_settings.get(
                "arguments", None),
            inputs=[model_train_input],
            outputs=[model_train_output],
            compute_target=model_train_compute_target,
            allow_reuse=model_train_settings.get("allow_reuse", True),
            version=model_train_settings.get("version", True))

    #########################
    ### Creating Pipeline ###
    #########################

    # Create Pipeline
    print("Creating Pipeline")
    pipeline = Pipeline(
        workspace=workspace,
        steps=[model_train],
        description="Training Pipeline",
    )

    # Validate pipeline
    print("Validating pipeline")
    pipeline.validate()

    return pipeline
Example #24
def build_prednet_pipeline(dataset, ws):
    print("building pipeline for dataset %s in workspace %s" %
          (dataset, ws.name))

    base_dir = "."

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = "./scripts"
    os.makedirs(script_folder)

    shutil.copytree(os.path.join(base_dir, "models"),
                    os.path.join(base_dir, script_folder, "models"))
    shutil.copy(os.path.join(base_dir, "train.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "data_preparation.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "register_prednet.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "batch_scoring.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "train_clf.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "register_clf.py"), script_folder)

    cpu_compute_name = args.cpu_compute_name
    cpu_compute_target = AmlCompute(ws, cpu_compute_name)
    print("found existing compute target: %s" % cpu_compute_name)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = args.gpu_compute_name

    gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
    print(gpu_compute_target.get_status().serialize())

    env = Environment.get(ws, "prednet")

    # Runconfigs
    runconfig = RunConfiguration()
    runconfig.environment = env
    print("PipelineData object created")

    # DataReference to where raw data is stored.
    raw_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="raw_data",
        path_on_datastore=os.path.join("prednet", "data", "raw_data"),
    )
    print("DataReference object created")

    # Intermediate PipelineData objects passed between the pipeline steps.
    preprocessed_data = PipelineData("preprocessed_data",
                                     datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    hd_child_cwd = PipelineData("prednet_model_path", datastore=def_blob_store)
    # prednet_path = PipelineData("outputs", datastore=def_blob_store)
    scored_data = PipelineData("scored_data", datastore=def_blob_store)
    model_path = PipelineData("model_path", datastore=def_blob_store)

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name="prepare_data",
        script_name="data_preparation.py",
        arguments=[
            "--raw_data",
            raw_data,
            "--preprocessed_data",
            preprocessed_data,
            "--dataset",
            dataset,
        ],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    # data_prep.run_after(video_decoding)

    print("data_prep step created")

    est = Estimator(
        source_directory=script_folder,
        compute_target=gpu_compute_target,
        entry_script="train.py",
        node_count=1,
        environment_definition=env,
    )

    ps = BayesianParameterSampling({
        "--batch_size":
        choice(1, 2, 4, 10),
        "--filter_sizes":
        choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        "--stack_sizes":
        choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),
        "--learning_rate":
        uniform(1e-6, 1e-3),
        "--lr_decay":
        uniform(1e-9, 1e-2),
        "--freeze_layers":
        choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
        # "--fine_tuning": choice("True", "False"),
    })

    hdc = HyperDriveConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        primary_metric_name="val_loss",
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=3,
        max_concurrent_runs=3,
        max_duration_minutes=60 * 6,
    )

    train_prednet = HyperDriveStep(
        "train_w_hyperdrive",
        hdc,
        estimator_entry_script_arguments=[
            "--preprocessed_data",
            preprocessed_data,
            "--remote_execution",
            "--dataset",
            dataset,
        ],
        inputs=[preprocessed_data],
        outputs=[hd_child_cwd],
        metrics_output=data_metrics,
        allow_reuse=True,
    )
    train_prednet.run_after(data_prep)

    register_prednet = PythonScriptStep(
        name="register_prednet",
        script_name="register_prednet.py",
        arguments=[
            "--data_metrics",
            data_metrics,
        ],
        compute_target=cpu_compute_target,
        inputs=[data_metrics, hd_child_cwd],
        source_directory=script_folder,
        allow_reuse=True,
    )
    register_prednet.run_after(train_prednet)

    batch_scoring = PythonScriptStep(
        name="batch_scoring",
        script_name="batch_scoring.py",
        arguments=[
            "--preprocessed_data",
            preprocessed_data,
            "--scored_data",
            scored_data,
            "--dataset",
            dataset,
            # "--prednet_path",
            # prednet_path
        ],
        compute_target=gpu_compute_target,
        inputs=[preprocessed_data],
        outputs=[scored_data],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    batch_scoring.run_after(register_prednet)

    train_clf = PythonScriptStep(
        name="train_clf",
        script_name="train_clf.py",
        arguments=[
            "--preprocessed_data", preprocessed_data, "--scored_data",
            scored_data, "--model_path", model_path
        ],
        compute_target=cpu_compute_target,
        inputs=[preprocessed_data, scored_data],
        outputs=[model_path],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    train_clf.run_after(batch_scoring)

    register_clf = PythonScriptStep(
        name="register_clf",
        script_name="register_clf.py",
        arguments=["--model_path", model_path],
        inputs=[model_path],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        allow_reuse=True,
        runconfig=runconfig,
    )
    register_clf.run_after(train_clf)

    pipeline = Pipeline(
        workspace=ws,
        steps=[
            data_prep,
            train_prednet,
            register_prednet,
            batch_scoring,
            train_clf,
            register_clf,
        ],
    )
    pipeline.validate()

    pipeline_name = "prednet_" + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    _ = Schedule.create(
        workspace=ws,
        name=pipeline_name + "_sch",
        pipeline_id=published_pipeline.id,
        experiment_name=pipeline_name,
        datastore=def_blob_store,
        wait_for_provisioning=True,
        description="Datastore scheduler for Pipeline" + pipeline_name,
        path_on_datastore=os.path.join("prednet/data/raw_data", dataset,
                                       "Train"),
        polling_interval=60 * 24,
    )

    published_pipeline.submit(ws, pipeline_name)