def get_run_configs(ws: Workspace, computetarget: ComputeTarget,
                    env: Env) -> Tuple[ParallelRunConfig, RunConfiguration]:
    """
    Creates the necessary run configurations required by the
    pipeline to enable parallelized scoring.

    :param ws: AML Workspace
    :param computetarget: AML Compute target
    :param env: Environment Variables

    :returns: Tuple[Scoring Run configuration, Score copy run configuration]
    """

    # get a conda environment for scoring
    environment = get_environment(
        ws,
        env.aml_env_name_scoring,
        conda_dependencies_file=env.aml_env_score_conda_dep_file,
        enable_docker=True,
        use_gpu=env.use_gpu_for_scoring,
        create_new=env.rebuild_env_scoring,
    )

    score_run_config = ParallelRunConfig(
        entry_script=env.batchscore_script_path,
        source_directory=env.sources_directory_train,
        error_threshold=10,
        output_action="append_row",
        compute_target=computetarget,
        node_count=env.max_nodes_scoring,
        environment=environment,
        run_invocation_timeout=300,
    )

    copy_run_config = RunConfiguration()
    copy_run_config.environment = get_environment(
        ws,
        env.aml_env_name_score_copy,
        conda_dependencies_file=env.aml_env_scorecopy_conda_dep_file,
        enable_docker=True,
        use_gpu=env.use_gpu_for_scoring,
        create_new=env.rebuild_env_scoring,
    )
    return (score_run_config, copy_run_config)
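Below is a minimal sketch of how the two returned configurations could be wired into pipeline steps. The step names, the copy script, and the input/output objects are illustrative assumptions, not part of the snippet above.

from azureml.pipeline.steps import ParallelRunStep, PythonScriptStep

scoring_config, copy_config = get_run_configs(ws, computetarget, env)

scoring_step = ParallelRunStep(
    name="batch-scoring",                  # assumed step name
    parallel_run_config=scoring_config,
    inputs=[scoring_input],                # dataset input defined elsewhere (assumed)
    output=scoring_output,                 # PipelineData defined elsewhere (assumed)
    allow_reuse=False,
)

copy_step = PythonScriptStep(
    name="copy-scores",                    # assumed step name
    script_name="score_copy.py",           # assumed script
    source_directory=env.sources_directory_train,
    runconfig=copy_config,
    compute_target=computetarget,
)
copy_step.run_after(scoring_step)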
Example #2
def test_perf(perf_test_configuration, workspace, request, wheel_file):
    print("Starting with test case {}".format(request.node.name))

    script_name = determine_script_name(request.node.name)
    generate_script(request, perf_test_configuration, script_name, SCRIPT_DIRECTORY)

    experiment = Experiment(workspace=workspace, name=EXPERIMENT_NAME)
    compute_target = workspace.get_default_compute_target(type='cpu')
    run_config = RunConfiguration()
    run_config.target = compute_target

    environment = configure_environment(workspace, wheel_file=wheel_file)
    run_config.environment = environment
    environment.register(workspace=workspace)
    script_run_config = ScriptRunConfig(source_directory=SCRIPT_DIRECTORY,
                                        script=script_name,
                                        run_config=run_config)
    print("submitting run")
    experiment.submit(config=script_run_config, tags=perf_test_configuration.__dict__)
    print("submitted run")
Example #3
def _submit_profile(dataset_profile_config_object, workspace, experiment_name):
    """Start Profile execution with the given config on the given workspace.

    :param dataset_profile_config_object:
    :param workspace:
    :param experiment_name:
    :param kwargs:
    :return:
    """
    dataset = dataset_profile_config_object._dataset
    compute_target = dataset_profile_config_object._compute_target

    if isinstance(compute_target, ComputeTarget):
        compute_target = compute_target.name
    run_id = 'dataset_' + str(uuid.uuid4())
    saved_dataset_id = dataset._ensure_saved(workspace)
    action_dto = _restclient(workspace).dataset.generate_profile_with_preview(
        workspace.subscription_id,
        workspace.resource_group,
        workspace.name,
        id=saved_dataset_id,
        compute_target=compute_target,
        experiment_name=experiment_name,
        run_id=run_id,
        custom_headers=_custom_headers)

    if dataset_profile_config_object._compute_target == _LOCAL_COMPUTE:
        with tempfile.TemporaryDirectory() as temp_dir:
            script = os.path.join(temp_dir, 'profile_run_script.py')
            copyfile(
                os.path.join(os.path.dirname(__file__),
                             '_profile_run_script.py'), script)
            run_local = RunConfiguration()
            run_local.environment.python.user_managed_dependencies = True
            run_local.environment.python.interpreter_path = sys.executable
            script_config = ScriptRunConfig(source_directory=temp_dir,
                                            script="profile_run_script.py",
                                            arguments=[
                                                action_dto.dataset_id,
                                                action_dto.action_id,
                                                saved_dataset_id
                                            ],
                                            run_config=run_local)
            experiment = Experiment(workspace, experiment_name)
            experiment.submit(script_config, run_id=run_id)
    else:
        experiment = Experiment(workspace, action_dto.experiment_name)
        run_id = action_dto.run_id
    run = get_run(experiment, run_id)
    return DatasetProfileRun(workspace, dataset, run)
Example #4
    def __init__(self, kubeflow_component):
        self._comp = kubeflow_component

        run_config = RunConfiguration()
        run_config.target = 'zhizhu-compute'
        run_config.environment.docker.enabled = True
        run_config.environment.docker.base_image = self._comp.image

        print(f"== Creating KubeflowComponentStep: name={self._comp.name}\n"
              f"   arguments={self._comp.command_and_args}\n"
              f"   inputs={self._comp.input_refs}\n"
              f"   outputs={self._comp.output_refs}\n")

        super().__init__(name=self._comp.name,
                         source_directory='script',
                         script_name='invoker.py',
                         arguments=self._comp.command_and_args,
                         inputs=self._comp.input_refs,
                         outputs=self._comp.output_refs,
                         compute_target='zhizhu-compute',
                         allow_reuse=True,
                         runconfig=run_config)
Example #5
    def scale_up(self, workers=1):
        """ Scale up the number of workers.
        """
        run_config = RunConfiguration()
        run_config.target = self.compute_target
        run_config.environment = self.environment_definition

        scheduler_ip = self.run.get_metrics()["scheduler"]
        args = [
            f"--scheduler_ip_port={scheduler_ip}",
            f"--use_gpu={self.use_gpu}",
            f"--n_gpus_per_node={self.n_gpus_per_node}",
            f"--worker_death_timeout={self.worker_death_timeout}",
        ]

        file_dataset_registered_name = self.kwargs.get(
            'file_dataset_registered_name', None)
        dataset_config_name = self.kwargs.get('dataset_config_name', None)
        path_on_compute = self.kwargs.get('path_on_compute', None)
        if path_on_compute is not None:
            dataset = Dataset.get_by_name(workspace=self.workspace,
                                          name=file_dataset_registered_name)
            input1 = dataset.as_named_input(dataset_config_name).as_mount(
                path_on_compute=path_on_compute)
            args.append(input1)

        child_run_config = ScriptRunConfig(
            source_directory=os.path.join(self.abs_path, "setup"),
            script="start_worker.py",
            arguments=args,
            run_config=run_config,
        )

        for i in range(workers):
            child_run = self.run.submit_child(child_run_config, tags=self.tags)
            self.workers_list.append(child_run)
            hostname = socket.gethostname()
Example #6
    def scale_up(self, workers=1):
        """Scale up the number of workers."""
        run_config = RunConfiguration()
        run_config.target = self.compute_target
        run_config.environment = self.environment_definition

        scheduler_ip = self.run.get_metrics()["scheduler"]
        args = [
            f"--scheduler_ip_port={scheduler_ip}",
            f"--use_gpu={self.use_gpu}",
            f"--n_gpus_per_node={self.n_gpus_per_node}",
            f"--worker_death_timeout={self.worker_death_timeout}",
        ]

        child_run_config = ScriptRunConfig(
            source_directory=os.path.join(self.abs_path, "setup"),
            script="start_worker.py",
            arguments=args,
            run_config=run_config,
        )

        for i in range(workers):
            child_run = self.run.submit_child(child_run_config, tags=self.tags)
            self.workers_list.append(child_run)
Example #7
    def _generate_run_config(self, step):
        """
        Generates an Azure ML run config if the user gives specifics about requirements.

        :param dict step: step defined by user that we are currently building

        :returns: run_config
        :rtype: RunConfiguration
        """
        try:
            conda_deps = self._get_conda_deps(step)
            conda_deps.add_conda_package("pip==20.0.2")
            return RunConfiguration(script=step["script"],
                                    conda_dependencies=conda_deps)
        except KeyError:
            return None
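A sketch of the step dictionary this method appears to expect. Only the "script" key is visible in the code above; the key consumed by _get_conda_deps is an assumption.

step = {
    "script": "train.py",                # required; a missing key makes the method return None
    "requirements": "requirements.txt",  # assumed key, read by _get_conda_deps(step)
}
run_config = builder._generate_run_config(step)  # "builder" is a hypothetical instance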
Example #8
def get_automl_environment(workspace: Workspace,
                           automl_settings_dict: AzureAutoMLSettings):
    from azureml.core import RunConfiguration
    from azureml.train.automl._environment_utilities import modify_run_configuration
    import logging
    null_logger = logging.getLogger("manymodels_null_logger")
    null_logger.addHandler(logging.NullHandler())
    null_logger.propagate = False
    automl_settings_obj = AzureAutoMLSettings.from_string_or_dict(
        automl_settings_dict)
    run_configuration = modify_run_configuration(automl_settings_obj,
                                                 RunConfiguration(),
                                                 logger=null_logger)
    train_env = run_configuration.environment
    train_env.environment_variables['DISABLE_ENV_MISMATCH'] = True
    train_env.environment_variables['AZUREML_FLUSH_INGEST_WAIT'] = ''
    train_env.environment_variables['AZUREML_METRICS_POLLING_INTERVAL'] = '30'
    return run_configuration.environment
Example #9
    def get_run_config(self):
        def _get_structured_interface_param(name, param_list):
            return next((param for param in param_list if param.name == name),
                        None)

        param_list = self.default_module_version.interface.parameters
        conda_content = _get_structured_interface_param(
            'CondaDependencies', param_list).default_value
        docker_enabled = _get_structured_interface_param(
            'DockerEnabled', param_list).default_value
        base_docker_image = _get_structured_interface_param(
            'BaseDockerImage', param_list).default_value
        conda_dependencies = CondaDependencies(
            _underlying_structure=ruamel.yaml.safe_load(conda_content))

        run_config = RunConfiguration()
        run_config.environment.docker.enabled = docker_enabled
        run_config.environment.docker.base_image = base_docker_image
        run_config.environment.python.conda_dependencies = conda_dependencies
        return run_config
Example #10
def load_runconfig_yaml(runconfig_yaml_file):
    try:
        run_config = RunConfiguration().load(
            path=runconfig_yaml_file
        )

        # Setting source directory for the script run config
        source_directory = os.path.split(runconfig_yaml_file)[0]
        if os.path.split(source_directory)[-1] in (".azureml", "aml_config"):
            source_directory = os.path.split(source_directory)[0]

        # defining scriptrunconfig
        run_config = ScriptRunConfig(
            source_directory=source_directory,
            run_config=run_config
        )
    except TypeError as exception:
        print(f"::debug::Error when loading runconfig yaml definition your repository (Path: /{runconfig_yaml_file}): {exception}")
        run_config = None
    except FileNotFoundError as exception:
        print(f"::debug::Error when loading runconfig yaml definition your repository (Path: /{runconfig_yaml_file}): {exception}")
        run_config = None
    return run_config
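A short usage sketch, assuming a runconfig YAML at .azureml/train.runconfig and a workspace config on disk (both illustrative):

from azureml.core import Experiment, Workspace

ws = Workspace.from_config()
script_run_config = load_runconfig_yaml(".azureml/train.runconfig")  # path assumed
if script_run_config is not None:
    run = Experiment(ws, "runconfig-demo").submit(script_run_config)  # experiment name assumed
    run.wait_for_completion(show_output=True)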
Example #11
curr_dir = os.path.dirname(os.path.realpath(__file__))
output_dir = 'outputs'
output_dir_local = os.path.join(curr_dir, '../../../', 'outputs')

# Pipeline parameters
run_experiment = True
register_model = False
publish_pipeline = False

# load workspace config, load default datastore.
ws = Workspace.from_config(auth=AzureCliAuthentication())
default_ds = ws.get_default_datastore()

# load run config
run_config = RunConfiguration.load(path=os.path.join(curr_dir, '../../../',
                                                     'aml_config'),
                                   name=run_config_name)

# define training pipeline with one AMLCompute step
trainStep = PythonScriptStep(
    script_name="train.py",
    name="Model Training",
    arguments=[
        '--data-dir',
        str(default_ds.as_mount()), '--output-dir', output_dir
    ],
    inputs=[DataReference(datastore=default_ds, mode="mount")],
    outputs=[
        PipelineData(name="model",
                     datastore=default_ds,
                     output_path_on_compute="training")
Example #12
###
# Define and set up pipeline
###

pipeline_param = PipelineParameter(name="my_arg", default_value="default")

my_step = PythonScriptStep(
    name="My Script Step",
    script_name="scriptstep.py",
    arguments=[pipeline_param],
    inputs=[],
    outputs=[],
    compute_target=compute_target,
    source_directory="src",
    allow_reuse=True,
    runconfig=RunConfiguration(conda_dependencies=CondaDependencies(
        conda_dependencies_file_path="environment.yml")),
)

pipeline_id, pipeline_endpoint = publish_pipeline(ws, [my_step], "blabla")

###
# Trigger pipeline via REST API
###

# To trigger the pipeline, a service principal is required: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication

token = requests.post(
    f"{config['sp']['resource_url']}/{config['sp']['tenant_id']}/oauth2/token",
    data={
        "grant_type": "client_credentials",
        "client_id": config["sp"]["client_id"],
Example #13
def submit_hyperdrive(
        experiment,
        hyperdrive_configuration_name,
        source_directory,
        run_configuration_name,
        path=None,
        run_async=None,
        conda_dependencies=None,
        ct_name=None,
        user_script_and_arguments=None,
        logger=None):
    from azureml.train.hyperdrive.runconfig import HyperDriveConfig, PrimaryMetricGoal
    policies = {
        "BANDITPOLICY": _get_bandit_policy,
        "MEDIANSTOPPINGPOLICY": _get_median_stopping_policy,
        "TRUNCATIONSELECTIONPOLICY": _get_truncation_selection_policy,
        "NOTERMINATIONPOLICY": _get_no_termination_policy
    }

    samplings = {
        "RANDOM": _get_random_sampling,
        "GRID": _get_grid_sampling,
        "BAYESIAN": _get_bayesian_sampling
    }

    if user_script_and_arguments and len(user_script_and_arguments) > 0:
        script, arguments = user_script_and_arguments[0], user_script_and_arguments[1:]
    else:
        script, arguments = None, None

    if run_configuration_name is None:
        raise UserErrorException("Please specify the name of the run configuration to use.")
    else:
        run_config = RunConfiguration.load(path, run_configuration_name)

    if conda_dependencies:
        from azureml.core.conda_dependencies import CondaDependencies
        cd = CondaDependencies(conda_dependencies_file_path=conda_dependencies)
        run_config.environment.python.conda_dependencies = cd

    if not run_config.script and not script:
        raise UserErrorException("Please specify the script to run either via parameter or in the runconfig")

    if run_config.script and script:
        logger.info("Overriding runconfig script %s with script argument %s", run_config.script, script)

    if script:
        run_config.script = script

    if run_config.arguments and arguments:
        logger.info("Overriding runconfig arguments %s with  %s", run_config.arguments, arguments)

    if arguments:
        run_config.arguments = arguments

    if ct_name:
        run_config.target = ct_name

    logger.info("Running %s with arguments %s", run_config.script, run_config.arguments)

    # default to path if source directory is missing.
    if source_directory is None:
        source_directory = path

    script_run_config = ScriptRunConfig(source_directory=source_directory, run_config=run_config)

    # Support absolute or relative to working directory file location.
    if os.path.isfile(hyperdrive_configuration_name):
        hd_config_file_path = hyperdrive_configuration_name
    else:
        # otherwise look for file where run config files are located (sub-folder of path)
        for root, dirs, files in os.walk(path):
            for file in files:
                if file.endswith(hyperdrive_configuration_name):
                    hd_config_file_path = os.path.join(root, file)

    with open(hd_config_file_path, "r") as hstream:
        hyperdrive_dict = ruamel.yaml.safe_load(hstream)

    hyperparameter_sampling_type = hyperdrive_dict.get('sampling').get('type')
    if hyperparameter_sampling_type is None:
        raise ValueError("Please provide hyperparameter sampling type in hyperdrive configuration file.")

    hyperparameter_sampling = samplings[hyperparameter_sampling_type.upper()](hyperdrive_dict)
    policy_type = hyperdrive_dict.get('policy').get('type', 'NOTERMINATIONPOLICY')
    policy = policies[policy_type.upper()](hyperdrive_dict)
    primary_metric_goal = PrimaryMetricGoal.from_str(hyperdrive_dict.get('primary_metric_goal'))
    hyperdrive_config = HyperDriveConfig(hyperparameter_sampling=hyperparameter_sampling,
                                         primary_metric_name=hyperdrive_dict.get('primary_metric_name'),
                                         primary_metric_goal=primary_metric_goal,
                                         max_total_runs=hyperdrive_dict.get('max_total_runs'),
                                         max_concurrent_runs=hyperdrive_dict.get('max_concurrent_runs'),
                                         max_duration_minutes=hyperdrive_dict.get('max_duration_minutes'),
                                         policy=policy,
                                         run_config=script_run_config)
    run = experiment.submit(hyperdrive_config)
    logger.debug("Running asynchronously: %s", run_async)
    if not run_async:
        run.wait_for_completion(show_output=True, wait_post_processing=True)

    return _run_to_output_dict(run)
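For orientation, the hyperdrive configuration file must yield the keys read above when parsed with ruamel.yaml.safe_load. A sketch of the equivalent Python dict follows; the parameter-space layout inside "sampling" is an assumption, since it is consumed by the private _get_*_sampling helpers.

hyperdrive_dict = {
    "sampling": {
        "type": "random",                # one of RANDOM, GRID, BAYESIAN
        # hyperparameter space consumed by _get_random_sampling (layout assumed)
    },
    "policy": {"type": "banditpolicy"},  # "type" defaults to NOTERMINATIONPOLICY
    "primary_metric_name": "accuracy",
    "primary_metric_goal": "maximize",
    "max_total_runs": 20,
    "max_concurrent_runs": 4,
    "max_duration_minutes": 60,
}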
logger.setLevel("INFO")
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch.setFormatter(formatter)
logger.addHandler(ch)

# GET WS, EXP, ENV and COMPUTE TARGET

ws = Workspace.from_config()
experiment = Experiment(ws,
                        "MaxFreezerTemperatureExceededPipeline",
                        _create_in_cloud=True)
compute_target = ComputeTarget(ws, "freezertrain")
run_config = RunConfiguration()
freezer_environment = ws.environments["sktime_freezer_environment"]
run_config.environment = freezer_environment
logger.info("Environment complete")

# PIPELINE PARAMS

output_df_long = PipelineData("output_df_long",
                              datastore=ws.get_default_datastore())
output_df_nested = PipelineData("output_df_nested",
                                datastore=ws.get_default_datastore())
time_series_length_param = PipelineParameter(name="time_series_length",
                                             default_value=10)
threshold_param = PipelineParameter(name="threshold", default_value=180.0)
dataset_name_param = PipelineParameter(name="dataset_name",
                                       default_value="processed_json")
Example #15
# Create the environment
tf_env = Environment(ENV_NAME)
tf_env.docker.enabled = True
tf_env.docker.base_image = BASE_IMAGE

# Define additional packages to be installed
conda_dep = CondaDependencies()
conda_dep.add_pip_package('tensorflow-gpu==2.3.0')
conda_dep.add_pip_package('pillow')

# Add packages to the environment
tf_env.python.conda_dependencies = conda_dep

# Create the configuration of an experiment
aml_run_config = RunConfiguration()
aml_run_config.environment = tf_env
# The name of the custom environment must not start with 'AzureML'
# https://github.com/MicrosoftDocs/azure-docs/issues/65770#issuecomment-724536550
aml_run_config.environment.name = 'road-segmentation-GPU'

# Create the compute target
compute_target = createAmlCompute(ws, CLUSTER_NAME, VM_SIZE)

dm = DataManager(ws)

# Obtain training set
images_dataset = dm.filterDataset('training', 'images/**/*.png')
labels_dataset = dm.filterDataset('training', 'labels/**/*_road_*.png')
scoring_images, training_images = dm.splitDataset(images_dataset,
                                                  0.2,
Example #16
import sys
from typing import Tuple

import click
from azureml.core import (ComputeTarget, Dataset, Environment,
                          RunConfiguration, Workspace)
from azureml.core.authentication import AzureCliAuthentication
from azureml.core.experiment import Experiment
from azureml.pipeline.core import (Pipeline, PipelineData, PipelineParameter,
                                   PublishedPipeline)
from azureml.pipeline.steps import DatabricksStep, PythonScriptStep

CLI_AUTH = AzureCliAuthentication()
# noinspection PyTypeChecker
WS = Workspace.from_config(auth=CLI_AUTH)
RC = RunConfiguration()
RC.environment = Environment.get(WS, "lightgbm")


# noinspection PyTypeChecker
def create_databricks_step(
        input_dataset: Dataset, compute: ComputeTarget,
        debug_run: bool) -> Tuple[DatabricksStep, PipelineData]:
    output_data = PipelineData(name="ParquetFiles",
                               datastore=WS.get_default_datastore(),
                               is_directory=True)

    node_size = 'Standard_DS4_v2'
    spark_version = '7.3.x-cpu-ml-scala2.12'

    db_step = DatabricksStep(
Example #17
def create_experiment_config(workspace):
    ########################################
    ### Creating data prep Pipeline Step ###
    ########################################

    # Load settings
    print("Loading settings")
    data_prep_step_path = os.path.join("steps", "data_prep")
    with open(os.path.join(data_prep_step_path, "step.json")) as f:
        data_prep_settings = json.load(f)

    # Setup datasets of first step
    print("Setting up datasets")
    data_prep_input = Dataset.get_by_name(workspace=workspace,
                                          name=data_prep_settings.get(
                                              "dataset_input_name",
                                              None)).as_named_input(
                                                  data_prep_settings.get(
                                                      "dataset_input_name",
                                                      None)).as_mount()
    data_prep_output = PipelineData(
        name=data_prep_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=data_prep_settings.get(
                                "datastore_output_name",
                                "workspaceblobstore")),
        output_mode="mount").as_dataset()
    # Uncomment the next lines if you want to register the intermediate dataset
    #data_prep_output.register(
    #    name=data_prep_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    data_prep_dependencies = CondaDependencies.create(
        pip_packages=data_prep_settings.get("pip_packages", []),
        conda_packages=data_prep_settings.get("conda_packages", []),
        python_version=data_prep_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    data_prep_run_config = RunConfiguration(
        conda_dependencies=data_prep_dependencies,
        framework=data_prep_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    data_prep_compute_target = ComputeTarget(workspace=workspace,
                                             name=data_prep_settings.get(
                                                 "compute_target_name", None))

    # Create python step
    print("Creating Step")
    data_prep = PythonScriptStep(
        name=data_prep_settings.get("step_name", None),
        script_name=data_prep_settings.get("script_name", None),
        arguments=data_prep_settings.get("arguments", []),
        compute_target=data_prep_compute_target,
        runconfig=data_prep_run_config,
        inputs=[data_prep_input],
        outputs=[data_prep_output],
        params=data_prep_settings.get("parameters", []),
        source_directory=data_prep_step_path,
        allow_reuse=data_prep_settings.get("allow_reuse", True),
        version=data_prep_settings.get("version", None),
    )

    ###############################################
    ### Creating data model train Pipeline Step ###
    ###############################################

    # Load settings
    print("Loading settings")
    model_train_step_path = os.path.join("steps", "model_train")
    with open(os.path.join(model_train_step_path, "step.json")) as f:
        model_train_settings = json.load(f)
    hyperparameter_sampling_settings = model_train_settings.get(
        "hyperparameter_sampling", {})

    # Setup datasets of first step
    print("Setting up datasets")
    model_train_input = data_prep_output.as_named_input(
        name=model_train_settings.get("dataset_input_name", None))
    model_train_output = PipelineData(
        name=model_train_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=model_train_settings.get(
                                "datastore_output_name", None)),
        output_mode="mount",
    ).as_dataset()
    # Uncomment the next lines if you want to register the intermediate dataset
    #model_train_output.register(
    #    name=model_train_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    model_train_dependencies = CondaDependencies.create(
        pip_packages=model_train_settings.get("pip_packages", []),
        conda_packages=model_train_settings.get("conda_packages", []),
        python_version=model_train_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    model_train_run_config = RunConfiguration(
        conda_dependencies=model_train_dependencies,
        framework=model_train_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    model_train_compute_target = ComputeTarget(workspace=workspace,
                                               name=model_train_settings.get(
                                                   "compute_target_name",
                                                   None))

    # Create distributed training backend
    print("Creating distributed training backend")
    distributed_training_backend = get_distributed_backend(
        backend_name=model_train_settings.get("distributed_backend", None))

    # Create Estimator for Training
    print("Creating Estimator for training")
    model_train_estimator = Estimator(
        source_directory=model_train_step_path,
        entry_script=model_train_settings.get("script_name", None),
        environment_variables=model_train_settings.get("parameters", None),
        compute_target=model_train_compute_target,
        node_count=model_train_settings.get("node_count", None),
        distributed_training=distributed_training_backend,
        conda_packages=model_train_settings.get("conda_packages", None),
        pip_packages=model_train_settings.get("pip_packages", None),
    )

    try:
        # Create parameter sampling
        print("Creating Parameter Sampling")
        parameter_dict = {}
        parameters = hyperparameter_sampling_settings.get("parameters", {})
        for parameter_name, parameter_details in parameters.items():
            parameter_distr = get_parameter_distribution(
                distribution=parameter_details.get("distribution", None),
                **parameter_details.get("settings", {}))
            parameter_dict[f"--{parameter_name}"] = parameter_distr
        model_train_ps = get_parameter_sampling(
            sampling_method=hyperparameter_sampling_settings.get(
                "method", None),
            parameter_dict=parameter_dict)

        # Get Policy definition
        policy_settings = hyperparameter_sampling_settings.get("policy", {})
        kwargs = {
            key: value
            for key, value in policy_settings.items() if key not in
            ["policy_method", "evaluation_interval", "delay_evaluation"]
        }

        # Create termination policy
        print("Creating early termination policy")
        model_train_policy = get_policy(
            policy_method=policy_settings.get("method", ""),
            evaluation_interval=policy_settings.get("evaluation_interval",
                                                    None),
            delay_evaluation=policy_settings.get("delay_evaluation", None),
            **kwargs)

        # Create HyperDriveConfig
        print("Creating HyperDriveConfig")
        model_train_hyperdrive_config = HyperDriveConfig(
            estimator=model_train_estimator,
            hyperparameter_sampling=model_train_ps,
            policy=model_train_policy,
            primary_metric_name=hyperparameter_sampling_settings.get(
                "primary_metric", None),
            primary_metric_goal=PrimaryMetricGoal.MINIMIZE
            if "min" in hyperparameter_sampling_settings.get(
                "primary_metric_goal", "") else PrimaryMetricGoal.MAXIMIZE,
            max_total_runs=hyperparameter_sampling_settings.get(
                "max_total_runs", 1),
            max_concurrent_runs=hyperparameter_sampling_settings.get(
                "max_concurrent_runs", 1),
            max_duration_minutes=hyperparameter_sampling_settings.get(
                "max_duration_minutes", None))

        # Create HyperDriveStep
        print("Creating HyperDriveStep")
        model_train = HyperDriveStep(
            name=model_train_settings.get("step_name", None),
            hyperdrive_config=model_train_hyperdrive_config,
            estimator_entry_script_arguments=model_train_settings.get(
                "arguments", None),
            inputs=[model_train_input],
            outputs=[model_train_output],
            allow_reuse=model_train_settings.get("allow_reuse", True),
            version=model_train_settings.get("version", True))
    except Exception:
        print("Not all required parameters specified for HyperDrive step")

        # Create EstimatorStep
        print("Creating EstimatorStep")
        model_train = EstimatorStep(
            name=model_train_settings.get("step_name", None),
            estimator=model_train_estimator,
            estimator_entry_script_arguments=model_train_settings.get(
                "arguments", None),
            inputs=[model_train_input],
            outputs=[model_train_output],
            compute_target=model_train_compute_target,
            allow_reuse=model_train_settings.get("allow_reuse", True),
            version=model_train_settings.get("version", True))

    #########################
    ### Creating Pipeline ###
    #########################

    # Create Pipeline
    print("Creating Pipeline")
    pipeline = Pipeline(
        workspace=workspace,
        steps=[model_train],
        description="Training Pipeline",
    )

    # Validate pipeline
    print("Validating pipeline")
    pipeline.validate()

    return pipeline
Example #18
    parser.add_argument('--subscription_id', help='the subscription id of aml')
    parser.add_argument('--resource_group', help='the resource group of aml')
    parser.add_argument('--workspace_name', help='the workspace name of aml')
    parser.add_argument('--compute_target',
                        help='the compute cluster name of aml')
    parser.add_argument('--docker_image', help='the docker image of job')
    parser.add_argument('--experiment_name', help='the experiment name')
    parser.add_argument('--script_dir', help='script directory')
    parser.add_argument('--script_name', help='script name')
    args = parser.parse_args()

    ws = Workspace(args.subscription_id, args.resource_group,
                   args.workspace_name)
    compute_target = ComputeTarget(workspace=ws, name=args.compute_target)
    experiment = Experiment(ws, args.experiment_name)
    run_config = RunConfiguration()
    dependencies = CondaDependencies()
    dependencies.add_pip_package("azureml-sdk")
    dependencies.add_pip_package("azureml")
    run_config.environment.python.conda_dependencies = dependencies
    run_config.environment.docker.enabled = True
    run_config.environment.docker.base_image = args.docker_image
    run_config.target = compute_target
    run_config.node_count = 1
    config = ScriptRunConfig(source_directory=args.script_dir,
                             script=args.script_name,
                             run_config=run_config)
    run = experiment.submit(config)
    print(run.get_details()["runId"])
    while True:
        line = sys.stdin.readline().rstrip()
Example #19
import time 
from azureml.core import ScriptRunConfig, RunConfiguration
from azureml.core import Workspace, Experiment 

ws = Workspace.from_config(path = './aml_config/PredictiveMaintenanceWSConfig.json') 

exp = Experiment(name = 'TrainModel', workspace = ws) 

#run_config = RunConfiguration.load(name = 'local', path = '.') 
run_config = RunConfiguration.load(name = 'amlcompute', path = '.') 
#run_config = RunConfiguration.load(name = 'cluster', path = '.') # `cluster` Compute Target should be created within Azure ML Workspace 

print(run_config) 

script_run_config = ScriptRunConfig(source_directory = '.', script = 'train.py', run_config = run_config) 

run = exp.submit(script_run_config) 

print(run.get_portal_url())

run.log('Starting Submission', time.asctime(time.localtime(time.time()))) 

run.wait_for_completion(show_output = True) 
Example #20
                    type=str,
                    help="Path to model training code",
                    dest="source_directory",
                    required=True)

args = parser.parse_args()
print(f'Arguments: {args}')

print('Connecting to workspace')
ws = Workspace.from_config()
print(
    f'WS name: {ws.name}\nRegion: {ws.location}\nSubscription id: {ws.subscription_id}\nResource group: {ws.resource_group}'
)

print('Loading runconfig for pipeline')
runconfig = RunConfiguration.load(args.runconfig)
runconfig_register = RunConfiguration.load(args.runconfig_register)

print('Loading dataset')
training_dataset = Dataset.get_by_name(ws, args.dataset)

# Parametrize dataset input to the pipeline
training_dataset_parameter = PipelineParameter(name="training_dataset",
                                               default_value=training_dataset)
training_dataset_consumption = DatasetConsumptionConfig(
    "training_dataset", training_dataset_parameter).as_mount()

train_step = PythonScriptStep(
    name="train-step",
    runconfig=runconfig,
    source_directory=args.source_directory,
Example #21
def create_pipeline():
    ws = Workspace.from_config(auth=authenticate())
    def_data_store = ws.get_default_datastore()
    run = Run.get_context()

    project_folder = "project"

    read_output = PipelineData("read_output",
                               datastore=def_data_store,
                               output_name="read_output")
    process_out = PipelineData("process_out",
                               datastore=def_data_store,
                               output_name="process_out")

    # hist, line, scatter
    chart_type = PipelineParameter(name="chart_type", default_value="line")

    # Check if compute exist
    compute_name = "Dedicated-DS3-v2"
    vm_size = "STANDARD_D3_V2"
    if compute_name in ws.compute_targets:
        compute_target = ws.compute_targets[compute_name]
        if compute_target and isinstance(compute_target, AmlCompute):
            print('Found compute target: ' + compute_name)
    else:
        # create the compute target
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size=vm_size, min_nodes=0, max_nodes=4)
        compute_target = ComputeTarget.create(ws, compute_name,
                                              provisioning_config)
        compute_target.wait_for_completion(show_output=True,
                                           min_node_count=None,
                                           timeout_in_minutes=20)

    # create run config for our python steps
    def conda_deps():
        deps = CondaDependencies(f'{project_folder}/environment.yml')
        deps.add_channel("conda-forge")
        deps.add_conda_package('curl')
        return deps

    run_config = RunConfiguration(conda_dependencies=conda_deps())
    run_config.environment.docker.enabled = True
    run_config.environment.spark.precache_packages = False

    # Create each step for our pipeline
    read_data = PythonScriptStep(
        name="read_data",
        script_name="read_data.py",
        arguments=["read-data", "--output-path", read_output],
        outputs=[read_output],
        compute_target=compute_target,
        source_directory=project_folder,
        runconfig=run_config)

    pre_process = PythonScriptStep(name="pre_process",
                                   script_name="pre_process.py",
                                   arguments=[
                                       "pre-process", "--input-path",
                                       read_output, "--output-path",
                                       process_out
                                   ],
                                   inputs=[read_output],
                                   outputs=[process_out],
                                   compute_target=compute_target,
                                   source_directory=project_folder,
                                   runconfig=run_config)

    visualize = PythonScriptStep(name="visualize",
                                 script_name="visualize.py",
                                 arguments=[
                                     "visualize", "--input-path", process_out,
                                     "--chart", chart_type
                                 ],
                                 inputs=[process_out],
                                 compute_target=compute_target,
                                 source_directory=project_folder,
                                 runconfig=run_config)

    # list of steps to run
    steps = [read_data, pre_process, visualize]

    # Build the pipeline
    test_pipeline = Pipeline(workspace=ws, steps=[steps])

    # Submit the pipeline to be run - In the same experiment
    pipeline_run = run.experiment.submit(test_pipeline)
    pipeline_run.wait_for_completion()
Example #22
def get_pipeline(aml_compute: ComputeTarget, blob_ds: Datastore,
                 batch_env: Environment, tf_env: Environment) -> list:
    """
    Creates pipeline steps
    Parameters:
        aml_compute (ComputeTarget): a reference to a compute
        blob_ds (Datastore): a reference to a datastore
        batch_env (Environment): a reference to an environment object
        tf_env (Environment): a horovod/tf environment
    Returns:
        list: the pipeline steps
    """

    # Intermediate pipeline data generated along the way
    pipeline_files = PipelineData("pipeline_files",
                                  datastore=blob_ds).as_dataset()

    # Pipeline parameters to use with every run
    is_debug = PipelineParameter("is_debug", default_value=False)
    relay_connection_name = PipelineParameter("debug_relay_connection_name",
                                              default_value="none")

    single_step_config = RunConfiguration()
    single_step_config.environment = batch_env
    single_step = PythonScriptStep(
        name=f"single-step",
        script_name="samples/azure_ml_advanced/steps/single_step.py",
        source_directory=".",
        runconfig=single_step_config,
        arguments=[
            "--pipeline-files", pipeline_files, "--is-debug", is_debug,
            "--debug-relay-connection-name", relay_connection_name,
            "--debug-port", 5678, "--debug-relay-connection-string-secret",
            debug_connection_string_secret_name
        ],
        inputs=[],
        outputs=[pipeline_files],
        compute_target=aml_compute,
        allow_reuse=False)

    output_dir = PipelineData("output_dir")

    parallel_run_config = ParallelRunConfig(
        entry_script="samples/azure_ml_advanced/steps/parallel_step.py",
        source_directory=".",
        mini_batch_size="5",
        output_action="summary_only",
        environment=batch_env,
        compute_target=aml_compute,
        error_threshold=10,
        run_invocation_timeout=600,  # very important for debugging
        node_count=2,
        process_count_per_node=1)

    parallelrun_step = ParallelRunStep(
        name="parallel-run-step",
        parallel_run_config=parallel_run_config,
        inputs=[pipeline_files],
        output=output_dir,
        arguments=[
            "--is-debug", is_debug, "--debug-relay-connection-name",
            relay_connection_name, "--debug-port", 5679,
            "--debug-relay-connection-string-secret",
            debug_connection_string_secret_name
        ],
        allow_reuse=False)

    parallelrun_step.run_after(single_step)

    distr_config = MpiConfiguration(process_count_per_node=1, node_count=2)

    src = ScriptRunConfig(
        source_directory=".",
        script="samples/azure_ml_advanced/steps/mpi/mpi_step_starter.py",
        arguments=[
            "--input-ds", pipeline_files, "--is-debug", is_debug,
            "--debug-relay-connection-name", relay_connection_name,
            "--debug-port", 5680, "--debug-relay-connection-string-secret",
            debug_connection_string_secret_name
        ],
        compute_target=aml_compute,
        environment=tf_env,
        distributed_job_config=distr_config,
    )

    mpi_step = PythonScriptStep(
        name="mpi-step",
        script_name="samples/azure_ml_advanced/steps/mpi/mpi_step_starter.py",
        arguments=[
            "--input-ds", pipeline_files, "--is-debug", is_debug,
            "--debug-relay-connection-name", relay_connection_name,
            "--debug-port", 5680, "--debug-relay-connection-string-secret",
            debug_connection_string_secret_name
        ],
        compute_target=aml_compute,
        inputs=[pipeline_files],
        outputs=[],
        runconfig=src.run_config,
        source_directory=".")

    mpi_step.run_after(parallelrun_step)

    print("Pipeline Steps Created")

    steps = [single_step, parallelrun_step, mpi_step]

    print(f"Returning {len(steps)} steps")
    return steps
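A sketch of how the returned steps might be assembled into a runnable pipeline; the workspace lookup is an assumption.

from azureml.core import Workspace
from azureml.pipeline.core import Pipeline

ws = Workspace.from_config()  # assumed workspace config on disk
pipeline = Pipeline(workspace=ws,
                    steps=get_pipeline(aml_compute, blob_ds, batch_env, tf_env))
pipeline.validate()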
Example #23
def build_pipeline_steps(automlconfig: AutoMLConfig,
                         data: Dataset,
                         target_column: str,
                         compute_target: ComputeTarget,
                         group_column_names: list,
                         time_column_name: str,
                         deploy: bool,
                         service_name: str = 'grouping-demo') -> StepSequence:
    steps = []

    metrics_output_name = 'metrics_{}'
    best_model_output_name = 'best_model_{}'
    count = 0
    model_names = []

    # get all automl configs by group
    configs = _get_configs(automlconfig, data, target_column, compute_target, group_column_names)

    # build a runconfig for register model
    register_config = RunConfiguration()
    cd = CondaDependencies()
    cd.add_pip_package('azureml-pipeline')
    register_config.environment.python.conda_dependencies = cd

    # create each automl step end-to-end (train, register)
    for group_name, conf in configs.items():
        # create automl metrics output
        metrics_data = PipelineData(
            name='metrics_data_{}'.format(group_name),
            pipeline_output_name=metrics_output_name.format(group_name),
            training_output=TrainingOutput(type='Metrics'))
        # create automl model output
        model_data = PipelineData(
            name='model_data_{}'.format(group_name),
            pipeline_output_name=best_model_output_name.format(group_name),
            training_output=TrainingOutput(type='Model', metric=conf.user_settings['primary_metric']))

        automl_step = AutoMLStep(
            name='automl_{}'.format(group_name),
            automl_config=conf,
            outputs=[metrics_data, model_data],
            allow_reuse=True)
        steps.append(automl_step)

        # pass the group name as a parameter to the register step ->
        # this will become the name of the model for this group.
        group_name_param = PipelineParameter("group_name_{}".format(count), default_value=group_name)
        count += 1

        reg_model_step = PythonScriptStep(
            'register.py',
            name='register_{}'.format(group_name),
            arguments=["--model_name", group_name_param, "--model_path", model_data],
            inputs=[model_data],
            compute_target=compute_target,
            runconfig=register_config,
            source_directory="register",
            allow_reuse=True
        )
        steps.append(reg_model_step)
        model_names.append(group_name)

    final_steps = steps
    if deploy:
        # modify the conda dependencies to ensure we pick up correct
        # versions of azureml-defaults and azureml-train-automl
        cd = CondaDependencies.create(pip_packages=['azureml-defaults', 'azureml-train-automl'])
        automl_deps = CondaDependencies(conda_dependencies_file_path='deploy/myenv.yml')
        cd._merge_dependencies(automl_deps)
        cd.save('deploy/myenv.yml')

        # add deployment step
        pp_group_column_names = PipelineParameter(
            "group_column_names",
            default_value="#####".join(list(reversed(group_column_names))))

        pp_model_names = PipelineParameter(
            "model_names",
            default_value=json.dumps(model_names))

        pp_service_name = PipelineParameter(
            "service_name",
            default_value=service_name)

        deployment_step = PythonScriptStep(
            'deploy.py',
            name='service_deploy',
            arguments=["--group_column_names", pp_group_column_names,
                       "--model_names", pp_model_names,
                       "--service_name", pp_service_name,
                       "--time_column_name", time_column_name],
            compute_target=compute_target,
            runconfig=RunConfiguration(),
            source_directory="deploy"
        )
        final_steps = StepSequence(steps=[steps, deployment_step])

    return final_steps
Example #24
# Connect to the workspace
ws = Workspace.from_config()
print(f'WS name: {ws.name}')
print(f'Region: {ws.location}')
print(f'Subscription id: {ws.subscription_id}')
print(f'Resource group: {ws.resource_group}')

default_training_dataset = Dataset.get_by_name(ws, default_dataset_name)

# Parametrize dataset input to the pipeline
training_dataset_parameter = PipelineParameter(name='training_dataset', default_value=default_training_dataset)
training_dataset_consumption = DatasetConsumptionConfig('training_dataset', training_dataset_parameter).as_download()

# Load runconfig from earlier exercise and create pipeline
runconfig = RunConfiguration.load(os.path.join(source_directory, 'runconfig.yml'))

train_step = PythonScriptStep(name='train-step',
                        source_directory=source_directory,
                        script_name='train.py',
                        arguments=['--data-path', training_dataset_consumption],
                        inputs=[training_dataset_consumption],
                        runconfig=runconfig,
                        allow_reuse=False)

steps = [train_step]

pipeline = Pipeline(workspace=ws, steps=steps)
pipeline.validate()
published_pipeline = pipeline.publish('training-pipeline')
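As a follow-up sketch, the published pipeline can be submitted through the SDK; the experiment name below is an assumption.

run = published_pipeline.submit(ws, experiment_name="training-pipeline-run")
run.wait_for_completion(show_output=True)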
Example #25
def get_backtest_pipeline(
    experiment: Experiment,
    dataset: TabularDataset,
    process_per_node: int,
    node_count: int,
    compute_target: ComputeTarget,
    automl_settings: Dict[str, Any],
    step_size: int,
    step_number: int,
    model_name: Optional[str] = None,
    model_uid: Optional[str] = None,
) -> Pipeline:
    """
    :param experiment: The experiment used to run the pipeline.
    :param dataset: Tabular data set to be used for model training.
    :param process_per_node: The number of processes per node. Generally it should be the number of cores
                             on the node divided by two.
    :param node_count: The number of nodes to be used.
    :param compute_target: The compute target to be used to run the pipeline.
    :param automl_settings: The dictionary with automl settings.
    :param step_size: The number of periods to step back in backtesting.
    :param step_number: The number of backtesting iterations.
    :param model_name: The name of a model to be back tested.
    :param model_uid: The uid to mark models from this run of the experiment.
    :return: The pipeline to be used for model retraining.
             **Note:** The output will be uploaded in the pipeline output
             called 'score'.
    """
    jasmine_client = JasmineClient(
        service_context=experiment.workspace.service_context,
        experiment_name=experiment.name,
        experiment_id=experiment.id,
    )
    env = jasmine_client.get_curated_environment(
        scenario=Scenarios.AUTOML,
        enable_dnn=False,
        enable_gpu=False,
        compute=compute_target,
        compute_sku=experiment.workspace.compute_targets.get(
            compute_target.name
        ).vm_size,
    )
    data_results = PipelineData(
        name="results", datastore=None, pipeline_output_name="results"
    )
    ############################################################
    # Split the data set using python script.
    ############################################################
    run_config = RunConfiguration()
    run_config.docker.use_docker = True
    run_config.environment = env

    utilities.set_environment_variables_for_run(run_config)

    split_data = PipelineData(name="split_data_output", datastore=None).as_dataset()
    split_step = PythonScriptStep(
        name="split_data_for_backtest",
        script_name="data_split.py",
        inputs=[dataset.as_named_input("training_data")],
        outputs=[split_data],
        source_directory=PROJECT_FOLDER,
        arguments=[
            "--step-size",
            step_size,
            "--step-number",
            step_number,
            "--time-column-name",
            automl_settings.get("time_column_name"),
            "--time-series-id-column-names",
            automl_settings.get("grain_column_names"),
            "--output-dir",
            split_data,
        ],
        runconfig=run_config,
        compute_target=compute_target,
        allow_reuse=False,
    )
    ############################################################
    # We will do the backtest using the parallel run step.
    ############################################################
    settings_path = os.path.join(PROJECT_FOLDER, SETTINGS_FILE)
    hru.dump_object_to_json(automl_settings, settings_path)
    mini_batch_size = PipelineParameter(name="batch_size_param", default_value=str(1))
    back_test_config = ParallelRunConfig(
        source_directory=PROJECT_FOLDER,
        entry_script="retrain_models.py",
        mini_batch_size=mini_batch_size,
        error_threshold=-1,
        output_action="append_row",
        append_row_file_name="outputs.txt",
        compute_target=compute_target,
        environment=env,
        process_count_per_node=process_per_node,
        run_invocation_timeout=3600,
        node_count=node_count,
    )
    utilities.set_environment_variables_for_run(back_test_config)
    forecasts = PipelineData(name="forecasts", datastore=None)
    if model_name:
        parallel_step_name = "{}-backtest".format(model_name.replace("_", "-"))
    else:
        parallel_step_name = "AutoML-backtest"

    prs_args = [
        "--target_column_name",
        automl_settings.get("label_column_name"),
        "--output-dir",
        forecasts,
    ]
    if model_name is not None:
        prs_args.append("--model-name")
        prs_args.append(model_name)
    if model_uid is not None:
        prs_args.append("--model-uid")
        prs_args.append(model_uid)
    backtest_prs = ParallelRunStep(
        name=parallel_step_name,
        parallel_run_config=back_test_config,
        arguments=prs_args,
        inputs=[split_data],
        output=forecasts,
        allow_reuse=False,
    )
    ############################################################
    # Then we collect the output and return it as the scores output.
    ############################################################
    collection_step = PythonScriptStep(
        name="score",
        script_name="score.py",
        inputs=[forecasts.as_mount()],
        outputs=[data_results],
        source_directory=PROJECT_FOLDER,
        arguments=["--forecasts", forecasts, "--output-dir", data_results],
        runconfig=run_config,
        compute_target=compute_target,
        allow_reuse=False,
    )
    # Build and return the pipeline.
    return Pipeline(
        workspace=experiment.workspace,
        steps=[split_step, backtest_prs, collection_step],
    )
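A minimal sketch of submitting the returned pipeline on the same experiment; the argument values are illustrative.

pipeline = get_backtest_pipeline(
    experiment=experiment,          # an azureml.core.Experiment (assumed)
    dataset=dataset,                # a TabularDataset (assumed)
    process_per_node=2,
    node_count=2,
    compute_target=compute_target,
    automl_settings=automl_settings,
    step_size=10,
    step_number=5,
)
run = experiment.submit(pipeline)
run.wait_for_completion(show_output=True)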
Example #26
def submit_run(
        experiment=None,
        path=None,
        run_configuration_name=None,
        source_directory=None,
        conda_dependencies=None,
        run_async=None,
        ct_name=None,
        user_script_and_arguments=None,
        logger=None):

    from azureml.core import RunConfiguration, ScriptRunConfig

    if user_script_and_arguments and len(user_script_and_arguments) > 0:
        script, arguments = user_script_and_arguments[0], user_script_and_arguments[1:]
    else:
        script, arguments = None, None

    if run_configuration_name is None:
        logger.info("No Run Configuration name provided, using default: local system-managed")
        run_config = RunConfiguration()
    else:
        run_config = RunConfiguration.load(path, run_configuration_name)

    if conda_dependencies:
        from azureml.core.conda_dependencies import CondaDependencies
        cd = CondaDependencies(conda_dependencies_file_path=conda_dependencies)
        run_config.environment.python.conda_dependencies = cd

    if not run_config.script and not script:
        raise UserErrorException("Please specify the script to run either via parameter or in the runconfig")

    if run_config.script and script:
        logger.info("Overriding runconfig script %s with script argument %s", run_config.script, script)

    if script:
        run_config.script = script

    if run_config.arguments and arguments:
        logger.info("Overriding runconfig arguments %s with  %s", run_config.arguments, arguments)

    if arguments:
        run_config.arguments = arguments

    if ct_name:
        run_config.target = ct_name

    # default to path if source directory is missing.
    if source_directory is None:
        source_directory = path

    logger.info("Running %s with arguments %s", run_config.script, run_config.arguments)
    script_run_config = ScriptRunConfig(source_directory=source_directory,
                                        run_config=run_config,
                                        arguments=run_config.arguments)

    run = experiment.submit(script_run_config)

    logger.debug("Running asynchronously: %s", run_async)
    if not run_async:
        run.wait_for_completion(show_output=True, wait_post_processing=True)

    return _run_to_output_dict(run)
Example #27
prepared_dataset = PipelineData(prepared_data_dir, datastore=datastore).as_dataset()
prepared_dataset = prepared_dataset.register(name=prepared_data_dir)

conda = CondaDependencies.create(
    pip_packages=[
        "azureml-sdk",
        "azureml-dataprep[fuse,pandas]",
        "torch==1.5.0",
        "nlp==0.2.0",
        "transformers==2.11.0",
    ],
    pin_sdk_version=False,
)
conda.set_pip_option("--pre")
run_config = RunConfiguration()
run_config.environment.python.conda_dependencies = conda

# Define Pipeline Parameters
model_name_param = PipelineParameter("model_name_or_path", "bert-base-cased")
max_seq_len_param = PipelineParameter("max_seq_length", 128)
task_param = PipelineParameter("task", "mrpc")
learning_rate_param = PipelineParameter("learning_rate", 2e-5)
seed_param = PipelineParameter("seed", 1)
train_batch_size_param = PipelineParameter("train_batch_size", 64)
eval_batch_size_param = PipelineParameter("eval_batch_size", 64)
max_epochs_param = PipelineParameter("max_epochs", 3)
num_gpus_param = PipelineParameter("gpus", 2)
num_workers_param = PipelineParameter("num_workers", 2)

Example #28
parser.add_argument("--source_directory",
                    type=str,
                    help="Path to model training code",
                    dest="source_directory",
                    required=True)
args = parser.parse_args()
print(f'Arguments: {args}')

print('Connecting to workspace')
ws = Workspace.from_config()
print(
    f'WS name: {ws.name}\nRegion: {ws.location}\nSubscription id: {ws.subscription_id}\nResource group: {ws.resource_group}'
)

print('Loading runconfig for pipeline')
runconfig = RunConfiguration.load(args.runconfig)

print('Loading dataset')
training_dataset = Dataset.get_by_name(ws, args.dataset)

# Parametrize dataset input to the pipeline
training_dataset_parameter = PipelineParameter(name="training_dataset",
                                               default_value=training_dataset)
training_dataset_consumption = DatasetConsumptionConfig(
    "training_dataset", training_dataset_parameter).as_mount()

pipeline_data = PipelineData("pipeline_data",
                             datastore=ws.get_default_datastore())
model_name_param = PipelineParameter(name="model_name",
                                     default_value="model.pkl")
Example #29
    'POSTGRES_PASSWORD':
    os.environ['POSTGRES_PASSWORD'],
    'POSTGRES_HOSTNAME':
    'ackbar-postgres.postgres.database.azure.com',
    'AZURE_STORAGE_CONNECTION_STRING':
    os.environ['AZURE_STORAGE_CONNECTION_STRING']
}
env = Environment(name='env', environment_variables=environment_variables)
conda = CondaDependencies()
conda.add_conda_package('psycopg2')
conda.add_conda_package('numpy')
conda.add_conda_package('Pillow')
# have to use pip to install azure packages...
conda.add_pip_package('azure-storage-blob')
env.python.conda_dependencies = conda
run_config = RunConfiguration()
run_config.environment = env

PROJECT = 'caltech'

prepare_step = PythonScriptStep(
    script_name='prepare.py',
    arguments=['--output', batch_input, '--project', PROJECT],
    inputs=[],
    outputs=[batch_input],
    compute_target=compute_target,
    source_directory='pipeline',
    runconfig=run_config,
    params=environment_variables,
)
Example #30
from azureml.pipeline.core import PipelineData
is_directory = False  # it's a file where we save the prepared dataframe
default_datastore = ws.get_default_datastore()
datapreparation_output = PipelineData('datapreparation_output',
                                      datastore=default_datastore,
                                      is_directory=is_directory)

# 1.B) Create the dependency object with mlextend package https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.conda_dependencies.condadependencies?view=azure-ml-py:
from azureml.core.environment import CondaDependencies
conda_dep_prep = CondaDependencies()
conda_dep_prep.add_pip_package(
    "mlxtend==0.17.2")  # or conda_dep.add_conda_package("mlxtend==0.17.2")

# 1.C) Create the RunConfiguration object:
from azureml.core import RunConfiguration
run_config_prep = RunConfiguration(conda_dependencies=conda_dep_prep)

# 1.D) Create the PythonScriptStep
from azureml.pipeline.steps import PythonScriptStep
data_preparation_step = PythonScriptStep(
    name="1: Data preparation",
    script_name="1-data_preparation.py",
    compute_target=compute_target,
    runconfig=run_config_prep,
    arguments=[
        "--remoteDataFolder", remote_data_folder, "--localDataFolder",
        local_data_to_download_folder, "--datapreparation_output",
        datapreparation_output, "--is_directory", 'aaa' if is_directory else ''
    ],  #  All non-empty strings have a True boolean value
    outputs=[datapreparation_output],
    source_directory='./local_scripts/')