from azureml.core import Datastore
from azureml.core.compute import ComputeTarget
from azureml.data.datapath import DataPath, DataPathComputeBinding
from azureml.pipeline.core import PipelineData, PipelineParameter
from azureml.pipeline.steps import EstimatorStep
from azureml.train.estimator import Estimator


def process_step(datastore: Datastore, compute: ComputeTarget,
                 path_on_datastore: str) -> (PipelineData, EstimatorStep):
    datapath = DataPath(datastore=datastore, path_on_datastore=path_on_datastore)
    data_path_pipeline_param = (PipelineParameter(name="data", default_value=datapath),
                                DataPathComputeBinding(mode='mount'))

    seer_tfrecords = PipelineData("tfrecords_set",
                                  datastore=datastore,
                                  is_directory=True)

    prep = Estimator(source_directory='.',
                     compute_target=compute,
                     entry_script='prep.py',
                     pip_requirements_file='requirements.txt')

    prepStep = EstimatorStep(name='Data Preparation',
                             estimator=prep,
                             estimator_entry_script_arguments=[
                                 "--source_path", data_path_pipeline_param,
                                 "--target_path", seer_tfrecords
                             ],
                             inputs=[data_path_pipeline_param],
                             outputs=[seer_tfrecords],
                             compute_target=compute)

    return seer_tfrecords, prepStep
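
# A minimal usage sketch, not part of the original snippet: it assumes an existing
# Workspace config, a default Datastore, and a compute cluster named "cpu-cluster"
# (all hypothetical names), and shows how the step returned by process_step could
# be assembled into a Pipeline and submitted as an experiment.
from azureml.core import Experiment, Workspace
from azureml.pipeline.core import Pipeline

ws = Workspace.from_config()                      # assumes a local config.json
datastore = ws.get_default_datastore()            # assumed datastore
compute = ws.compute_targets["cpu-cluster"]       # hypothetical compute name

tfrecords, prep_step = process_step(datastore, compute, "seer/images")  # hypothetical path

pipeline = Pipeline(workspace=ws, steps=[prep_step])
run = Experiment(ws, "seer-data-prep").submit(pipeline)
run.wait_for_completion(show_output=True)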
# 2. Data Process Step
# 3. Training Step
# 4. Model Registration Step
# 5. Pipeline registration
# 6. Submit the pipeline for execution

## Pipeline Parameters ##
# We need to tell the Pipeline what it needs to learn to see!
source_dataset = DataPath(
    datastore=ds,
    path_on_datastore="simpsonslego-v3")

source_dataset_param = (PipelineParameter(name="source_dataset", default_value=source_dataset),
                        DataPathComputeBinding())

# Configuration for data prep and training steps #

## Data Process Step ##
# prep.py file versions our data in our data source #

# Output location for the pre-processed training images
training_data_location = PipelineData(name="simpsons_training_data", datastore=ds)

# Create the pre-process step
preProcessDataStep = PythonScriptStep(
    name="Pre-process data",
    script_name="steps/prep.py",
    compute_target=cluster,
    inputs=[source_dataset_param],
# The following will be created and then run:
# 1. Pipeline Parameters
# 2. Data Process Step
# 3. Training Step
# 4. Model Registration Step
# 5. Pipeline registration
# 6. Submit the pipeline for execution

## Pipeline Parameters ##
# We need to tell the Pipeline what it needs to learn to see!
datapath = DataPath(datastore=datastore, path_on_datastore=datastorepath)
data_path_pipeline_param = (PipelineParameter(name="data", default_value=datapath),
                            DataPathComputeBinding(mode='mount'))

# Configuration for data prep and training steps #
dataprepEnvironment = Environment.from_pip_requirements('dataprepenv',
                                                        'requirements-dataprepandtraining.txt')
dataprepRunConfig = RunConfiguration()
dataprepRunConfig.environment = dataprepEnvironment

## Data Process Step ##
# parse.py file parses the images in our data source #
seer_tfrecords = PipelineData("tfrecords_set",
                              datastore=datastore,
                              is_directory=True)
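
# A hedged sketch of how the pieces above would typically be wired together (the
# step itself is not shown in the snippet): the (PipelineParameter,
# DataPathComputeBinding) tuple is passed as a step input and argument, the
# PipelineData as an output, and the run configuration built above is reused.
# The step name, argument flags, and the `compute` target are illustrative
# assumptions; parse.py is the script named in the comment above.
from azureml.pipeline.steps import PythonScriptStep

parseStep = PythonScriptStep(
    name="Parse images",
    script_name="parse.py",
    source_directory=".",
    compute_target=compute,                 # assumes a ComputeTarget defined elsewhere
    runconfig=dataprepRunConfig,
    arguments=["--source_path", data_path_pipeline_param,
               "--target_path", seer_tfrecords],
    inputs=[data_path_pipeline_param],
    outputs=[seer_tfrecords])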
from azureml.core import Datastore, Environment
from azureml.core.runconfig import RunConfiguration
from azureml.data.datapath import DataPath, DataPathComputeBinding
from azureml.pipeline.core import Pipeline, PipelineParameter
from azureml.pipeline.steps import PythonScriptStep


def create_pipeline(workspace):
    # Retrieve compute cluster
    compute_target = workspace.compute_targets[args.compute_target]

    # Setup batch scoring environment from conda dependencies
    environment = Environment.from_conda_specification(
        name=args.environment_name, file_path=args.environment_specification
    )

    # Add environment variables
    environment.environment_variables = {
        "APPLICATIONINSIGHTS_CONNECTION_STRING": args.ai_connection_string
    }

    # Enable docker run
    environment.docker.enabled = True

    # Create run config
    run_config = RunConfiguration()
    run_config.environment = environment

    # Retrieve input and output datastores
    input_datastore = Datastore(workspace, args.input_datastore_name)
    output_datastore = Datastore(workspace, args.output_datastore_name)

    # Define build id parameter
    build_id_param = PipelineParameter("build_id", default_value=args.build_id)

    # Define input datapath parameter
    input_datapath = DataPath(datastore=input_datastore, path_on_datastore="")
    input_datapath_param = (
        PipelineParameter(name="input_datapath", default_value=input_datapath),
        DataPathComputeBinding(mode="mount"),
    )

    # Define output datapath parameter
    output_datapath = DataPath(datastore=output_datastore, path_on_datastore="")
    output_datapath_param = (
        PipelineParameter(name="output_datapath", default_value=output_datapath),
        DataPathComputeBinding(mode="mount"),
    )

    # Define score step for pipeline
    score_step = PythonScriptStep(
        name="score_data",
        compute_target=compute_target,
        source_directory="src/score",
        script_name="score.py",
        inputs=[input_datapath_param, output_datapath_param],
        runconfig=run_config,
        allow_reuse=False,
        arguments=[
            "--build_id", build_id_param,
            "--input_datapath", input_datapath_param,
            "--output_datapath", output_datapath_param,
        ],
    )

    # Define pipeline for batch scoring
    pipeline = Pipeline(workspace=workspace, steps=[score_step])
    return pipeline
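
# A minimal submission sketch, not part of the original function: it shows how the
# DataPath pipeline parameters defined above can be overridden at submission time.
# The workspace handle, experiment name, build id, datastore name, and path are
# hypothetical, and it assumes the module-level `args` used by create_pipeline is
# already populated.
from azureml.core import Datastore, Experiment, Workspace
from azureml.data.datapath import DataPath

ws = Workspace.from_config()
pipeline = create_pipeline(ws)

run = Experiment(ws, "batch-scoring").submit(           # hypothetical experiment name
    pipeline,
    pipeline_parameters={
        "build_id": "20240101.1",                       # hypothetical build id
        "input_datapath": DataPath(
            datastore=Datastore(ws, "input"),           # assumes this datastore name
            path_on_datastore="batches/2024-01-01"),    # hypothetical path
    })
run.wait_for_completion(show_output=True)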
environment = Environment.get(workspace, name="AzureML-Scikit-learn-0.20.3")
environment.docker.enabled = True

run_config = RunConfiguration()
run_config.environment = environment

compute_target = workspace.compute_targets["cpu"]
run_config.target = compute_target

train_features_datapath = DataPath(datastore=blobstore,
                                   path_on_datastore="training_set_features.csv")
train_features_path_parameter = PipelineParameter(
    name="train_features", default_value=train_features_datapath)
train_features_path = (train_features_path_parameter,
                       DataPathComputeBinding(mode="mount"))

train_labels_datapath = DataPath(datastore=blobstore,
                                 path_on_datastore="training_set_labels.csv")
train_labels_path_parameter = PipelineParameter(
    name="train_labels", default_value=train_labels_datapath)
train_labels_path = (train_labels_path_parameter,
                     DataPathComputeBinding(mode="mount"))

test_features_datapath = DataPath(datastore=blobstore,
                                  path_on_datastore="test_set_features.csv")
test_features_path_parameter = PipelineParameter(
    name="test_features", default_value=test_features_datapath)
test_features_path = (test_features_path_parameter,
                      DataPathComputeBinding(mode="mount"))
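
# A hedged sketch of how these (PipelineParameter, DataPathComputeBinding) tuples
# are typically consumed: each tuple is passed both as a step input and as a
# command-line argument so the script receives the mounted path at runtime. The
# step name, script name, source directory, and argument flags are illustrative
# assumptions, not from the original snippet.
from azureml.pipeline.steps import PythonScriptStep

train_step = PythonScriptStep(
    name="train_model",
    script_name="train.py",                 # hypothetical training script
    source_directory="src",                 # hypothetical source directory
    compute_target=compute_target,
    runconfig=run_config,
    arguments=["--train-features", train_features_path,
               "--train-labels", train_labels_path,
               "--test-features", test_features_path],
    inputs=[train_features_path, train_labels_path, test_features_path])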
def __init__(self, source_directory, script=None, arguments=None, run_config=None,
             _telemetry_values=None, compute_target=None, environment=None,
             distributed_job_config=None, resume_from=None,
             max_run_duration_seconds=MAX_DURATION_SECONDS_DEFAULT, command=None,
             docker_runtime_config=None):
    """Class ScriptRunConfig constructor.

    :param source_directory: A local directory containing code files needed for a run.
    :type source_directory: str
    :param script: The file path relative to the source_directory of the script to be run.
    :type script: str
    :param arguments: Optional command line arguments to pass to the training script.
        Arguments are passed in pairs, for example, ['--arg1', arg1_val, '--arg2', arg2_val].
    :type arguments: builtin.list[str]
    :param run_config: Optional run configuration to use.
    :type run_config: azureml.core.runconfig.RunConfiguration
    :param _telemetry_values: Internal use only.
    :type _telemetry_values: dict
    :param compute_target: The compute target where training will happen. This can either be a
        ComputeTarget object, the name of an existing ComputeTarget, or the string "local".
        If no compute target is specified, your local machine will be used.
    :type compute_target: azureml.core.compute_target.AbstractComputeTarget or str
    :param environment: The environment to use for the run. If no environment is specified,
        azureml.core.runconfig.DEFAULT_CPU_IMAGE will be used as the Docker image for the run.
    :type environment: azureml.core.environment.Environment
    :param distributed_job_config: For jobs that require additional distributed job-specific
        configurations.
    :type distributed_job_config: azureml.core.runconfig.TensorflowConfiguration,
        azureml.core.runconfig.MpiConfiguration, or azureml.core.runconfig.PyTorchConfiguration
    :param resume_from: The DataPath containing the checkpoint or model files from which to
        resume the experiment.
    :type resume_from: azureml.data.datapath.DataPath
    :param max_run_duration_seconds: The maximum time allowed for the run. The system will attempt
        to automatically cancel the run if it took longer than this value.
    :type max_run_duration_seconds: int
    :param command: The command to be submitted for the run. The command property can also be used
        instead of script/arguments. Both command and script/argument properties cannot be used
        together to submit a run.
        To submit a script file using the command property - ['python', 'train.py', '--arg1', arg1_val]
        To run an actual command - ['ls']
    :type command: builtin.list[str] or str
    :param docker_runtime_config: For jobs that require Docker runtime-specific configurations.
    :type docker_runtime_config: azureml.core.runconfig.DockerConfiguration
    """
    self.source_directory = source_directory
    self.script = script
    self.command = command
    self.arguments = arguments

    if run_config:
        if (max_run_duration_seconds != ScriptRunConfig.MAX_DURATION_SECONDS_DEFAULT) or \
                any([compute_target, environment, distributed_job_config, docker_runtime_config]):
            logging.warning(
                "If 'run_config' is specified, the following parameters will be "
                "ignored: 'compute_target', 'environment', 'distributed_job_config', "
                "'max_run_duration_seconds', and 'docker_runtime_config'.")

        if run_config.script and self.script:
            logging.warning(
                "If 'script' has been provided here and a script file name has been specified in "
                "'run_config', 'script' provided in ScriptRunConfig initialization will take "
                "precedence.")
        elif run_config.script and not self.script:
            self.script = run_config.script

        if run_config.arguments and self.arguments:
            logging.warning(
                "If 'arguments' has been provided here and arguments have been specified in "
                "'run_config', 'arguments' provided in ScriptRunConfig initialization will "
                "take precedence.")
        elif run_config.arguments and not self.arguments:
            self.arguments = run_config.arguments

        if run_config.command and self.command:
            logging.warning(
                "If 'command' has been provided here and command has been specified in "
                "'run_config', 'command' provided in ScriptRunConfig initialization will "
                "take precedence.")
        elif run_config.command and not self.command:
            self.command = run_config.command

        self.run_config = run_config
    else:
        self.run_config = RunConfiguration()
        self.run_config.target = compute_target if compute_target else "local"
        self.run_config.environment = environment if environment else EnvironmentDefinition()
        self.run_config.max_run_duration_seconds = max_run_duration_seconds

        if distributed_job_config:
            if not isinstance(distributed_job_config,
                              (TensorflowConfiguration, MpiConfiguration, PyTorchConfiguration)):
                raise RunConfigurationException(
                    "'distributed_job_config' must be an "
                    "'azureml.core.runconfig.TensorflowConfiguration', "
                    "'azureml.core.runconfig.MpiConfiguration', or "
                    "'azureml.core.runconfig.PyTorchConfiguration' object.")

            self.run_config.node_count = (
                distributed_job_config.worker_count
                if isinstance(distributed_job_config, TensorflowConfiguration)
                else distributed_job_config.node_count)

            if isinstance(distributed_job_config, TensorflowConfiguration):
                self.run_config.tensorflow = distributed_job_config
                self.run_config.framework = "TensorFlow"
                self.run_config.communicator = "ParameterServer"
            elif isinstance(distributed_job_config, MpiConfiguration):
                self.run_config.mpi = distributed_job_config
                self.run_config.framework = "Python"
                self.run_config.communicator = "IntelMpi"
            elif isinstance(distributed_job_config, PyTorchConfiguration):
                self.run_config.pytorch = distributed_job_config
                self.run_config.framework = "PyTorch"
                self.run_config.communicator = distributed_job_config.communication_backend

        if docker_runtime_config:
            if not isinstance(docker_runtime_config, DockerConfiguration):
                raise RunConfigurationException(
                    "'docker_runtime_config' must be a 'DockerConfiguration' object.")
            self.run_config.docker = docker_runtime_config
        elif environment and environment.docker:
            # Docker configuration in run config is higher priority than docker settings
            # in environment docker section, explicitly assign the setting values here
            # to keep backward compatibility.
            self.run_config.docker = DockerConfiguration(
                use_docker=environment.docker._enabled,
                shm_size=environment.docker._shm_size,
                shared_volumes=environment.docker._shared_volumes,
                arguments=environment.docker._arguments)

    if resume_from:
        if not isinstance(resume_from, DataPath):
            raise UserErrorException("resume_from parameter should be of type DataPath. "
                                     "Found {}.".format(type(resume_from)))
        outputs_data_reference = resume_from.create_data_reference(
            data_reference_name="MODEL_LOCATION",
            datapath_compute_binding=DataPathComputeBinding(mode="mount"))
        if self.arguments is None:
            self.arguments = []
        self.arguments.extend(["--resume-from", str(outputs_data_reference)])
        self.run_config.data_references[outputs_data_reference.data_reference_name] = \
            outputs_data_reference.to_config()

    self._telemetry_values = _telemetry_values
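
# A short usage sketch for the constructor above (assumptions flagged inline):
# resuming a training run from checkpoints stored on a datastore, which routes
# through the resume_from branch shown above. The workspace, datastore path,
# script names, and compute name are hypothetical.
from azureml.core import Environment, Experiment, ScriptRunConfig, Workspace
from azureml.data.datapath import DataPath

ws = Workspace.from_config()
checkpoints = DataPath(datastore=ws.get_default_datastore(),
                       path_on_datastore="runs/previous/outputs")     # hypothetical path

src = ScriptRunConfig(
    source_directory="src",                               # hypothetical directory
    script="train.py",                                    # hypothetical script
    compute_target="gpu-cluster",                         # hypothetical compute name
    environment=Environment.get(ws, "AzureML-Minimal"),   # curated environment
    resume_from=checkpoints,                              # appends --resume-from to the arguments
    max_run_duration_seconds=3600)

run = Experiment(ws, "resume-training").submit(src)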
import json
import os

from azureml.core import Datastore
from azureml.core.compute import ComputeTarget
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration
from azureml.data.datapath import DataPath, DataPathComputeBinding
from azureml.pipeline.core import Pipeline, PipelineData, PipelineParameter
from azureml.pipeline.steps import PythonScriptStep


def create_experiment_config(workspace):
    ########################################
    ### Creating data prep Pipeline Step ###
    ########################################

    # Load settings
    print("Loading settings")
    data_prep_step_path = os.path.join("steps", "data_prep")
    with open(os.path.join(data_prep_step_path, "step.json")) as f:
        data_prep_settings = json.load(f)

    # Setup datasets - Create PipelineParameter for dynamic pipeline input
    print("Setting up datasets with dynamic input")
    data_prep_input_path = DataPath(
        datastore=Datastore(workspace=workspace,
                            name=data_prep_settings.get("datastore_input_name", "workspaceblobstore")),
        path_on_datastore="golden/Atlantis/PAX1/15-Mar-2020-23-37-50-279971/PAX1.parquet/")
    data_prep_input_path_pipeline_parameter = PipelineParameter(
        name="input_path", default_value=data_prep_input_path)
    data_prep_input = (data_prep_input_path_pipeline_parameter,
                       DataPathComputeBinding(mode="mount"))
    data_prep_output = PipelineData(
        name=data_prep_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=data_prep_settings.get("datastore_output_name", "workspaceblobstore")),
        output_mode="mount").as_dataset()
    # Uncomment next lines, if you want to register intermediate dataset
    #data_prep_output.register(
    #    name=data_prep_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    data_prep_dependencies = CondaDependencies.create(
        pip_packages=data_prep_settings.get("pip_packages", []),
        conda_packages=data_prep_settings.get("conda_packages", []),
        python_version=data_prep_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    data_prep_run_config = RunConfiguration(
        conda_dependencies=data_prep_dependencies,
        framework=data_prep_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    data_prep_compute_target = ComputeTarget(
        workspace=workspace,
        name=data_prep_settings.get("compute_target_name", None))

    # Create python step
    print("Creating Step")
    data_prep = PythonScriptStep(
        name=data_prep_settings.get("step_name", None),
        script_name=data_prep_settings.get("script_name", None),
        arguments=data_prep_settings.get("arguments", []) + ["--input-datapath", data_prep_input],
        compute_target=data_prep_compute_target,
        runconfig=data_prep_run_config,
        inputs=[data_prep_input],
        outputs=[data_prep_output],
        params=data_prep_settings.get("parameters", []),
        source_directory=data_prep_step_path,
        allow_reuse=data_prep_settings.get("allow_reuse", True),
        version=data_prep_settings.get("version", None),
    )

    ############################################
    ### Creating inference Parallel Run Step ###
    ############################################

    # Load settings
    print("Loading settings")
    batch_inference_step_path = os.path.join("steps", "batch_inference")
    with open(os.path.join(batch_inference_step_path, "step.json")) as f:
        batch_inference_settings = json.load(f)

    # Setup datasets of first step
    print("Setting up datasets")
    batch_inference_input = data_prep_output.as_named_input(
        name=batch_inference_settings.get("dataset_input_name", None))
    batch_inference_output = PipelineData(
        name=batch_inference_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=batch_inference_settings.get("datastore_output_name", None)),
        output_mode="mount",
    ).as_dataset()
    # Uncomment next lines, if you want to register intermediate dataset
    #batch_inference_output.register(
    #    name=batch_inference_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    batch_inference_dependencies = CondaDependencies.create(
        pip_packages=batch_inference_settings.get("pip_packages", []),
        conda_packages=batch_inference_settings.get("conda_packages", []),
        python_version=batch_inference_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    batch_inference_run_config = RunConfiguration(
        conda_dependencies=batch_inference_dependencies,
        framework=batch_inference_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    batch_inference_compute_target = ComputeTarget(
        workspace=workspace,
        name=batch_inference_settings.get("compute_target_name", None))

    # Create python step
    print("Creating Step")
    batch_inference = PythonScriptStep(
        name=batch_inference_settings.get("step_name", None),
        script_name=batch_inference_settings.get("script_name", None),
        arguments=batch_inference_settings.get("arguments", []),
        compute_target=batch_inference_compute_target,
        runconfig=batch_inference_run_config,
        inputs=[batch_inference_input],
        outputs=[batch_inference_output],
        params=batch_inference_settings.get("parameters", []),
        source_directory=batch_inference_step_path,
        allow_reuse=batch_inference_settings.get("allow_reuse", True),
        version=batch_inference_settings.get("version", None),
    )

    #########################
    ### Creating Pipeline ###
    #########################

    # Create Pipeline
    print("Creating Pipeline")
    pipeline = Pipeline(
        workspace=workspace,
        steps=[batch_inference],
        description="Batch Inference Pipeline",
    )

    return pipeline
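
# A minimal sketch, not part of the original function: submitting the returned
# pipeline with the input_path DataPath parameter overridden, then publishing it
# for reuse. It assumes the steps/*/step.json settings files are present; the
# workspace handle, experiment name, published pipeline name, and datastore path
# are hypothetical.
from azureml.core import Experiment, Workspace
from azureml.data.datapath import DataPath

ws = Workspace.from_config()
pipeline = create_experiment_config(ws)

run = Experiment(ws, "batch-inference").submit(              # hypothetical experiment name
    pipeline,
    pipeline_parameters={
        "input_path": DataPath(
            datastore=ws.get_default_datastore(),
            path_on_datastore="golden/Atlantis/PAX1/latest/PAX1.parquet/")  # hypothetical path
    })
run.wait_for_completion(show_output=True)

published = pipeline.publish(name="batch-inference",         # hypothetical name
                             description="Data prep + batch inference")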