def as_input(self, name=None):
    """Specify how to consume the output as an input in subsequent pipeline steps.

    :param name: The name of the input specific to the run.
    :type name: str
    :return: A :class:`azureml.data.dataset_consumption_config.DatasetConsumptionConfig` instance
        describing how to deliver the input data.
    :rtype: azureml.data.dataset_consumption_config.DatasetConsumptionConfig
    """
    from azureml.data.dataset_consumption_config import DatasetConsumptionConfig

    name = name or self.__class__._generate_random_name('input')
    return DatasetConsumptionConfig(name, self, DIRECT_MODE)
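
# A minimal sketch of how as_input can chain two pipeline steps. The script names
# 'prep.py' and 'train.py' and the compute target 'cpu-cluster' are illustrative
# assumptions, not part of the original. as_input() returns a DatasetConsumptionConfig
# describing how the upstream output is delivered to the downstream step.
from azureml.core import Workspace
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep

ws = Workspace.from_config()
prepared_data = OutputFileDatasetConfig(name='prepared_data')

prep_step = PythonScriptStep(name='prep-step',
                             script_name='prep.py',
                             source_directory='.',
                             arguments=['--output', prepared_data],
                             compute_target='cpu-cluster')

train_step = PythonScriptStep(name='train-step',
                              script_name='train.py',
                              source_directory='.',
                              arguments=['--input', prepared_data.as_input('prepared_data')],
                              compute_target='cpu-cluster')

pipeline = Pipeline(workspace=ws, steps=[prep_step, train_step])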
def as_named_input(self, name):
    """Provide a name for this dataset which will be used to retrieve the materialized dataset in the run.

    .. remarks::

        The name here will only be applicable inside an Azure Machine Learning run. The name must only
        contain alphanumeric and underscore characters so it can be made available as an environment
        variable. You can use this name to retrieve the dataset in the context of a run using two
        approaches:

        * Environment variable: The name will be the environment variable name and the materialized
          dataset will be made available as the value of the environment variable. If the dataset is
          downloaded or mounted, the value will be the downloaded/mounted path. For example:

          .. code-block:: python

              # in your job submission notebook/script:
              dataset.as_named_input('foo').as_download('/tmp/dataset')

              # in the script that will be executed in the run
              import os
              path = os.environ['foo']  # path will be /tmp/dataset

          .. note::

              If the dataset is set to direct mode, then the value will be the dataset ID. You can then
              retrieve the dataset object by doing ``Dataset.get_by_id(os.environ['foo'])``.

        * Run.input_datasets: This is a dictionary where the key will be the dataset name you specified
          in this method and the value will be the materialized dataset. For downloaded and mounted
          datasets, the value will be the downloaded/mounted path. For direct mode, the value will be
          the same dataset object you specified in your job submission script.

          .. code-block:: python

              # in your job submission notebook/script:
              dataset.as_named_input('foo')  # direct mode

              # in the script that will be executed in the run
              run = Run.get_context()
              run.input_datasets['foo']  # this returns the dataset object from above

    :param name: The name of the dataset for the run.
    :type name: str
    :return: The configuration object describing how the Dataset should be materialized in the run.
    :rtype: azureml.data.dataset_consumption_config.DatasetConsumptionConfig
    """
    return DatasetConsumptionConfig(name, self)
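
# A small sketch of the run-side retrieval for a dataset submitted with
# as_named_input('foo') in direct mode (i.e. without as_mount/as_download).
# The name 'foo' follows the docstring example above, and to_pandas_dataframe()
# assumes the input is a TabularDataset.
import os
from azureml.core import Run, Dataset

run = Run.get_context()

# approach 1: Run.input_datasets yields the dataset object itself in direct mode
ds = run.input_datasets['foo']

# approach 2: the environment variable holds the dataset ID in direct mode
ds_by_id = Dataset.get_by_id(run.experiment.workspace, os.environ['foo'])

df = ds.to_pandas_dataframe()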
import azureml.core
from azureml.core import Workspace, Dataset
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep

print("SDK version:", azureml.core.VERSION)

dataset_name = 'grib-dataset'

ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')

datastore = ws.get_default_datastore()

input_ds = Dataset.get_by_name(ws, dataset_name)
batch_data = DatasetConsumptionConfig("batch_dataset", input_ds, mode='mount')

output_dir = PipelineData(name='batch_output', datastore=datastore)

parallel_run_config = ParallelRunConfig.load_yaml(workspace=ws, path='convert_parallel.yml')

batch_step = ParallelRunStep(name="batch-conversion-step",
                             parallel_run_config=parallel_run_config,
                             arguments=['--data_output_path', output_dir],
                             inputs=[batch_data],
                             output=output_dir,
                             allow_reuse=False)

steps = [batch_step]
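
# A possible continuation (the experiment name 'batch-conversion' is an assumption):
# build the pipeline from the steps above, validate it, and submit it as an experiment run.
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

pipeline = Pipeline(workspace=ws, steps=steps)
pipeline.validate()

pipeline_run = Experiment(ws, 'batch-conversion').submit(pipeline)
pipeline_run.wait_for_completion(show_output=True)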
ws = Workspace.from_config()
print(f'WS name: {ws.name}\nRegion: {ws.location}\nSubscription id: {ws.subscription_id}\nResource group: {ws.resource_group}')

print('Loading parallel runconfig for pipeline')
parallel_run_config = ParallelRunConfig.load_yaml(workspace=ws, path=args.runconfig)

print('Loading default batch dataset')
batch_dataset = Dataset.get_by_name(ws, args.dataset)

# Parametrize dataset input and dataset output name (batch scoring result) for the pipeline
batch_dataset_parameter = PipelineParameter(name="batch_dataset", default_value=batch_dataset)
batch_dataset_consumption = DatasetConsumptionConfig("batch_dataset", batch_dataset_parameter).as_mount()

datastore = ws.get_default_datastore()
output_dataset_name = "batch_scoring_results"

# Existing GA code - does not allow specifying the path on the datastore
# output_dataset = PipelineData(name='batch_output', datastore=datastore).as_dataset()
# output_dataset = output_dataset.register(name=output_dataset_name, create_new_version=True)

# New code, not yet GA - allows specifying the path on the datastore
destination_on_datastore = (datastore, 'output_dataset_name/')
output_dataset = OutputFileDatasetConfig(name='batch_results',
                                         destination=destination_on_datastore
                                         ).register_on_complete(name=output_dataset_name)
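
# A sketch of how these pieces might be wired into the scoring step; the step name
# is illustrative, and it assumes an SDK version in which ParallelRunStep accepts an
# OutputFileDatasetConfig as its output.
from azureml.pipeline.steps import ParallelRunStep

batch_step = ParallelRunStep(name="batch-scoring-step",
                             parallel_run_config=parallel_run_config,
                             inputs=[batch_dataset_consumption],
                             output=output_dataset,
                             allow_reuse=False)

# After a successful run, register_on_complete() makes the results retrievable as a
# registered dataset, e.g. Dataset.get_by_name(ws, output_dataset_name).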
ws = Workspace.from_config()
print(f'WS name: {ws.name}\nRegion: {ws.location}\nSubscription id: {ws.subscription_id}\nResource group: {ws.resource_group}')

print('Loading runconfig for pipeline')
runconfig = RunConfiguration.load(args.runconfig)
runconfig_register = RunConfiguration.load(args.runconfig_register)

print('Loading dataset')
training_dataset = Dataset.get_by_name(ws, args.dataset)

# Parametrize dataset input to the pipeline
training_dataset_parameter = PipelineParameter(name="training_dataset",
                                               default_value=training_dataset)
training_dataset_consumption = DatasetConsumptionConfig("training_dataset",
                                                        training_dataset_parameter).as_mount()

train_step = PythonScriptStep(name="train-step",
                              runconfig=runconfig,
                              source_directory=args.source_directory,
                              script_name=runconfig.script,
                              arguments=['--data_path', training_dataset_consumption],
                              inputs=[training_dataset_consumption],
                              allow_reuse=False)

register_step = PythonScriptStep(name="register-step",
                                 runconfig=runconfig_register,
                                 source_directory=args.source_directory,
                                 arguments=['--model_name', args.model_name],
                                 script_name=runconfig_register.script)
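
# A possible continuation: make registration run after training, then build the
# pipeline and publish it. The pipeline name and description are illustrative.
from azureml.pipeline.core import Pipeline

register_step.run_after(train_step)

pipeline = Pipeline(workspace=ws, steps=[train_step, register_step])
pipeline.validate()

published_pipeline = pipeline.publish(name='training-pipeline',
                                      description='Train and register the model')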
default_dataset_name = 'german-credit-train-tutorial'

print(f'Azure ML SDK version: {azureml.core.VERSION}')

# Connect to the workspace
ws = Workspace.from_config()
print(f'WS name: {ws.name}')
print(f'Region: {ws.location}')
print(f'Subscription id: {ws.subscription_id}')
print(f'Resource group: {ws.resource_group}')

default_training_dataset = Dataset.get_by_name(ws, default_dataset_name)

# Parametrize dataset input to the pipeline
training_dataset_parameter = PipelineParameter(name='training_dataset',
                                               default_value=default_training_dataset)
training_dataset_consumption = DatasetConsumptionConfig('training_dataset',
                                                        training_dataset_parameter).as_download()

# Load runconfig from earlier exercise and create pipeline
runconfig = RunConfiguration.load(os.path.join(source_directory, 'runconfig.yml'))

train_step = PythonScriptStep(name='train-step',
                              source_directory=source_directory,
                              script_name='train.py',
                              arguments=['--data-path', training_dataset_consumption],
                              inputs=[training_dataset_consumption],
                              runconfig=runconfig,
                              allow_reuse=False)

steps = [train_step]
pipeline = Pipeline(workspace=ws, steps=steps)
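
# A minimal sketch of submitting the pipeline; the experiment name is an assumption.
# Because the input is a PipelineParameter, a different registered dataset can be
# supplied per run via pipeline_parameters.
from azureml.core import Experiment

pipeline_run = Experiment(ws, 'credit-training-pipeline').submit(
    pipeline,
    pipeline_parameters={'training_dataset': default_training_dataset})
pipeline_run.wait_for_completion(show_output=True)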
# can poll for a minimum number of nodes and for a specific timeout.
# if no min node count is provided it uses the scale settings for the cluster
compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

# use get_status() to get a detailed status for the current cluster.
print(compute_target.get_status().serialize())

def_data_store = ws.get_default_datastore()

from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.pipeline.core import PipelineParameter

pipeline_param = PipelineParameter(name="mnist_param", default_value=dataset)
input_mnist_ds_consumption = DatasetConsumptionConfig("mnist_param_config", pipeline_param).as_mount()

from azureml.pipeline.core import Pipeline, PipelineData

output_dir = PipelineData(name="inferences",
                          datastore=def_data_store,
                          output_path_on_compute="mnist/results")

from azureml.core import Workspace, Run, Model, Dataset
from azureml.core.authentication import ServicePrincipalAuthentication
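
# A sketch of how the consumption config and output could feed a ParallelRunStep;
# 'parallel_run_config' is assumed to have been created elsewhere (e.g. from a scoring
# script, environment and compute target), and the step name is illustrative.
from azureml.pipeline.steps import ParallelRunStep

parallel_run_step = ParallelRunStep(name="batch-score-mnist",
                                    parallel_run_config=parallel_run_config,
                                    inputs=[input_mnist_ds_consumption],
                                    output=output_dir,
                                    allow_reuse=False)

pipeline = Pipeline(workspace=ws, steps=[parallel_run_step])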