def update_io(inputs, outputs):
    for key, value in inputs.items():
        if isinstance(value, _Dataset):
            raise UserErrorException(
                "Dataset cannot be used without providing a name for the run. Please provide "
                "a name by calling the as_named_input instance method on dataset."
            )
        elif isinstance(value, DatasetConsumptionConfig):
            value.dataset._ensure_saved(workspace)
            inputs[key] = Data.create(value)
            input_data.append(value)

            # Tell the execution service to skip mount validation for datasets
            # that always resolve to their latest version.
            if value.dataset._consume_latest:
                env_vars = run_config.environment.environment_variables
                if _SKIP_VALIDATE_DATASETS not in env_vars:
                    env_vars[_SKIP_VALIDATE_DATASETS] = value.name
                else:
                    env_vars[_SKIP_VALIDATE_DATASETS] = ",".join(
                        [env_vars[_SKIP_VALIDATE_DATASETS], value.name])
        elif isinstance(value, Data):
            input_data.append(value)
        else:
            raise UserErrorException("{} cannot be used as input.".format(
                type(value).__name__))

    for key, value in outputs.items():
        if isinstance(value, OutputDatasetConfig):
            outputs[key] = output_data[key] = value._to_output_data()
        elif isinstance(value, OutputData):
            output_data[key] = value
        else:
            raise UserErrorException("{} cannot be used as output.".format(
                type(value).__name__))
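For reference, a minimal sketch of how update_io might be driven. The workspace handle ws and the registered dataset name 'mnist' are assumptions, and note that update_io is written as a closure: workspace, run_config, input_data, and output_data must already be bound in its enclosing scope.

from azureml.core import Dataset, Workspace

ws = Workspace.from_config()
dataset = Dataset.get_by_name(ws, name='mnist')  # assumed registered dataset

inputs = {'data_folder': dataset.as_named_input('mnist').as_mount()}
outputs = {}
update_io(inputs, outputs)
# inputs['data_folder'] has been replaced by an azureml.core.runconfig.Data
# object, and the consumption config was appended to input_data.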
# Dataset is aliased to RunDataset to avoid clashing with azureml.core.Dataset
from azureml.core.runconfig import Data, DataLocation, Dataset as RunDataset


def load_data(dataset, input_name):
    # Build a runconfig Data entry that mounts the given dataset and exposes
    # the mount point through an environment variable named after the input.
    data = Data(
        data_location=DataLocation(dataset=RunDataset(dataset_id=dataset.id)),
        create_output_directories=False,
        mechanism='mount',
        environment_variable_name=input_name,
        overwrite=True)
    return data
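A hedged usage sketch for load_data: the workspace handle and the dataset name 'mnist' are assumptions, and the resulting Data object is wired into a fresh RunConfiguration by hand.

from azureml.core import Dataset, Workspace
from azureml.core.runconfig import RunConfiguration

ws = Workspace.from_config()
dataset = Dataset.get_by_name(ws, name='mnist')

run_config = RunConfiguration()
run_config.data = {'mnist': load_data(dataset, input_name='mnist')}
# At runtime the mount path is surfaced through the environment variable
# named by input_name (here, 'mnist').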
def _to_input_config(config):
    from azureml.core.runconfig import Data, DataLocation, Dataset

    data_location_json = config.get("DataLocation", None)
    dataset_json = data_location_json.get(
        "Dataset", None) if data_location_json else None
    dataset_id = dataset_json.get("Id") if dataset_json else None
    dataset_name = dataset_json.get("Name") if dataset_json else None
    dataset_version = dataset_json.get("Version") if dataset_json else None

    dataset = Dataset(dataset_id=dataset_id, dataset_name=dataset_name,
                      dataset_version=dataset_version)
    data_location = DataLocation(dataset=dataset)

    create_output_directories = config.get("CreateOutputDirectories", False)
    # Guard against a missing "Mechanism" key; calling .lower() on None
    # would raise an AttributeError.
    mechanism = config.get("Mechanism", None)
    mechanism = mechanism.lower() if mechanism else None
    environment_variable_name = config.get("EnvironmentVariableName", None)
    path_on_compute = config.get("PathOnCompute", None)
    overwrite = config.get("Overwrite", False)

    return Data(data_location=data_location,
                create_output_directories=create_output_directories,
                mechanism=mechanism,
                environment_variable_name=environment_variable_name,
                path_on_compute=path_on_compute,
                overwrite=overwrite)
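To make the serialized shape concrete, here is an illustrative config dict (the field values are made up, not taken from a real run definition) round-tripped through _to_input_config:

config = {
    "DataLocation": {
        "Dataset": {
            "Id": "11111111-2222-3333-4444-555555555555",
            "Name": "mnist",
            "Version": "1",
        }
    },
    "CreateOutputDirectories": False,
    "Mechanism": "Mount",
    "EnvironmentVariableName": "mnist",
    "PathOnCompute": None,
    "Overwrite": False,
}

data = _to_input_config(config)
assert data.mechanism == 'mount'  # mechanism is normalized to lowercase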
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import Data, DataLocation, Dataset as RunDataset
from azureml.core.script_run_config import get_run_config_from_script_run

# Assumes `dataset`, `input_name`, and `conda_env` (an Environment) are
# already defined.
compute_name = 'cpu-cluster'

# Define the script run config
src = ScriptRunConfig(
    source_directory='scripts',
    script='train.py',
    arguments=[
        '--data-folder',
        'DatasetConsumptionConfig:{}'.format(input_name)
    ])

# Define the data section of the runconfig
src.run_config.data = {
    input_name: Data(
        data_location=DataLocation(
            dataset=RunDataset(dataset_id=dataset.id)),
        create_output_directories=False,
        mechanism='mount',
        environment_variable_name=input_name,
        overwrite=False
    )
}

# Set other parameters for the run
src.run_config.framework = 'python'
src.run_config.environment = conda_env
src.run_config.target = compute_name
src.run_config.node_count = 4

# Save the run configuration to .azureml/mnist.runconfig
get_run_config_from_script_run(src).save(name='mnist.runconfig')
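From here, the run can be submitted directly from the ScriptRunConfig. The experiment name 'mnist' and the workspace handle are assumptions:

from azureml.core import Experiment, Workspace

ws = Workspace.from_config()
run = Experiment(workspace=ws, name='mnist').submit(src)
run.wait_for_completion(show_output=True)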