def main():
    try:
        global workspace
        global datastore

        # Parse command line arguments
        args = parse_args()

        # Retrieve workspace
        workspace = Workspace.get(
            subscription_id=args.subscription_id,
            resource_group=args.resource_group,
            name=args.workspace_name,
        )

        # Retrieve default datastore for testing
        datastore = workspace.get_default_datastore()

        # Define directories for input and output test data on the datastore
        input_file_path = f"tests/inputs/{args.build_id}"
        output_file_path = f"tests/outputs/{args.build_id}"
        print("Variable [input_file_path]:", input_file_path)
        print("Variable [output_file_path]:", output_file_path)

        # Copy data to the input directory on the datastore for testing
        copy_data_for_tests(args.dataset_name, input_file_path)

        # Define pipeline parameters
        pipeline_parameters = {
            "build_id": args.build_id,
            "input_datapath": DataPath(datastore=datastore, path_on_datastore=input_file_path),
            "output_datapath": DataPath(datastore=datastore, path_on_datastore=output_file_path),
        }
        print("Variable [pipeline_parameters]:", pipeline_parameters)

        # Run pipeline
        run_pipeline(workspace, args.pipeline_name, pipeline_parameters)

        # List all files in the input and output datasets
        input_dataset_files = get_dataset_file(input_file_path)
        output_dataset_files = get_dataset_file(output_file_path)
        print("Variable [input_dataset_files]:", input_dataset_files)
        print("Variable [output_dataset_files]:", output_dataset_files)

        # Every input file should have been scored and its result saved to the output datastore
        assert len(input_dataset_files) == len(output_dataset_files)
    except Exception:
        print(f"Exception: run_pipeline.py\n{traceback.format_exc()}")
        exit(1)
def main():
    try:
        global args

        # Parse command line arguments
        args = parse_args(sys.argv[1:])

        # Retrieve workspace
        workspace = Workspace.get(
            subscription_id=args.subscription_id,
            resource_group=args.resource_group,
            name=args.workspace_name,
        )

        if args.pipeline_action == "draft":
            pipeline = create_pipeline(workspace)
            draft_pipeline(
                workspace,
                pipeline,
                args.pipeline_name,
                args.experiment_name,
                args.build_id,
                args.pipeline_metadata_file,
            )
        elif args.pipeline_action == "run":
            # Define pipeline parameters
            pipeline_parameters = {
                "build_id": args.build_id,
                "input_datapath": DataPath(
                    datastore=args.input_datastore_name,
                    path_on_datastore=args.input_datastore_path,
                ),
                "output_datapath": DataPath(
                    datastore=args.output_datastore_name,
                    path_on_datastore=args.output_datastore_path,
                ),
            }
            run_pipeline(workspace, args.pipeline_name, pipeline_parameters)
        elif args.pipeline_action == "publish":
            publish_pipeline(
                workspace, args.pipeline_name, args.disable_published_pipelines
            )
        else:
            raise Exception(f"Invalid pipeline action: {args.pipeline_action}")
    except Exception:
        exception = f"Exception: train_pipeline.py\n{traceback.format_exc()}"
        print(exception)
        exit(1)
def get_input_dataset(ws: Workspace, ds: Datastore, env: Env) -> Dataset:
    """
    Gets an input dataset wrapped around an input data file.
    The input data file is assumed to exist in the supplied datastore.

    :param ws: AML Workspace
    :param ds: Datastore containing the data file
    :param env: Environment variables
    :returns: Input Dataset
    """
    scoringinputds = Dataset.Tabular.from_delimited_files(
        path=DataPath(ds, env.scoring_datastore_input_filename)
    )

    scoringinputds = scoringinputds.register(
        ws,
        name=env.scoring_dataset_name,
        tags={"purpose": "scoring input", "format": "csv"},
        create_new_version=True,
    ).as_named_input(env.scoring_dataset_name)

    return scoringinputds
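# Illustrative usage sketch (not part of the original source): the named input returned by
# get_input_dataset() can be attached to a pipeline step and then resolved by the same name
# inside that step's script. The step name, script path, compute target and run config
# below are assumptions.
from azureml.pipeline.steps import PythonScriptStep

scoring_input = get_input_dataset(ws, ds, env)

score_step = PythonScriptStep(
    name="score",
    source_directory="src/score",      # hypothetical folder
    script_name="score.py",            # hypothetical script
    inputs=[scoring_input],
    compute_target=compute_target,
    runconfig=run_config,
)

# Inside score.py, the registered tabular dataset is available on the run context:
#   run = Run.get_context()
#   df = run.input_datasets["<scoring_dataset_name>"].to_pandas_dataframe()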
def process_step(datastore: Datastore,
                 compute: ComputeTarget,
                 path_on_datastore: str) -> (PipelineData, EstimatorStep):
    datapath = DataPath(datastore=datastore, path_on_datastore=path_on_datastore)
    data_path_pipeline_param = (PipelineParameter(name="data", default_value=datapath),
                                DataPathComputeBinding(mode='mount'))

    seer_tfrecords = PipelineData("tfrecords_set",
                                  datastore=datastore,
                                  is_directory=True)

    prep = Estimator(source_directory='.',
                     compute_target=compute,
                     entry_script='prep.py',
                     pip_requirements_file='requirements.txt')

    prepStep = EstimatorStep(
        name='Data Preparation',
        estimator=prep,
        estimator_entry_script_arguments=[
            "--source_path", data_path_pipeline_param,
            "--target_path", seer_tfrecords
        ],
        inputs=[data_path_pipeline_param],
        outputs=[seer_tfrecords],
        compute_target=compute)

    return seer_tfrecords, prepStep
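# Illustrative usage sketch (not part of the original source): wiring the step returned by
# process_step() into a pipeline and overriding the DataPath-backed "data" parameter at
# submission time. The experiment name and datastore paths are assumptions.
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

tfrecords, prep_step = process_step(datastore, compute, "trainingdata")
pipeline = Pipeline(workspace=ws, steps=[prep_step])

run = Experiment(ws, "seer-data-prep").submit(
    pipeline,
    pipeline_parameters={
        # Point the same pipeline at a different folder on the datastore
        "data": DataPath(datastore=datastore, path_on_datastore="trainingdata-v2"),
    },
)
run.wait_for_completion(show_output=True)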
def load_tabular_partition(self,
                           partition_name: str,
                           datastore_name: str = None,
                           columns: np.array = None,
                           first_row_header: bool = False,
                           cloud_storage: bool = True) -> pd.DataFrame:
    '''
    Loads a partition from a tabular dataset.
        The implementation will connect to the DataStore and get all delimited files matching the partition_name.
        When configured locally, the implementation will append all files in the datastore path with name {partition_name}.csv.
    Args:
        partition_name (str): The name of the partition as a wildcard filter. Example: B* will take all files starting with B and ending with .csv
        columns (np.array): The column names to assign to the dataframe
        datastore_name (str): The name of a DataStore that contains Datasets
        first_row_header (bool): Indicates whether the first row of the files contains the header
        cloud_storage (bool): When changed to False, the dataset will be loaded from the local folder
    Returns:
        pd.DataFrame: The dataset, loaded as a DataFrame
    '''
    if not datastore_name:
        # No datastore name is given, so we'll take the default one
        datastore_name = self.__datastore_path

    if cloud_storage:
        # Connecting data store
        datastore = Datastore(self.__workspace, name=datastore_name)
        try:
            _header = PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS if first_row_header else False
            _aml_dataset = Dataset.Tabular.from_delimited_files(
                header=_header,
                path=DataPath(datastore, '/' + partition_name + '.csv'))  # , set_column_types=columns
            _df = _aml_dataset.to_pandas_dataframe()
        except DatasetValidationError as dsvalex:
            if 'provided path is not valid' in str(dsvalex):
                return None
            else:
                raise
    else:
        # Reading data from sub files in a folder
        _folder_path = datastore_name
        _partition_files = glob.glob(_folder_path + '/' + partition_name + '.csv')
        _record_found = False
        _df = None
        for filename in _partition_files:
            _header = 0 if first_row_header else None
            df = pd.read_csv(filename, index_col=None, header=_header)
            if not _record_found:
                _df = df
                _record_found = True
            else:
                _df = _df.append(df)

        if not _record_found:
            return None

    if columns is not None:
        _df.columns = columns
    return _df
def get_input_dataset(workspace, datastore, env):
    scoring_input_ds = Dataset.Tabular.from_delimited_files(
        path=DataPath(datastore, env.scoring_datastore_input_filename))

    scoring_input_ds = scoring_input_ds.register(
        workspace=workspace,
        name=env.scoring_dataset_name,
        tags={
            'purpose': 'for scoring',
            'format': 'csv'
        },
        create_new_version=True).as_named_input(env.scoring_dataset_name)

    return scoring_input_ds
def submit_pipeline(
        workspace=None,  # Auto populated args + object
        pipeline_id=None,
        experiment_name=None,
        pipeline_yaml=None,
        pipeline_params=None,
        datapath_params=None,
        output_file=None,
        # We enforce a logger
        logger=None):
    """
    Submit a pipeline run based on a published pipeline ID
    """
    if pipeline_id is None and pipeline_yaml is None:
        raise UserErrorException("Please specify a pipeline ID or a pipeline YAML file")

    published_pipeline = None
    pipeline = None

    if pipeline_id is not None:
        from azureml.pipeline.core import PublishedPipeline
        published_pipeline = PublishedPipeline.get(workspace, pipeline_id)
        if experiment_name is None or experiment_name == '':
            # Use the pipeline name as the experiment name
            experiment_name = published_pipeline._sanitize_name()
    else:
        from azureml.pipeline.core import Pipeline
        pipeline = Pipeline.load_yaml(workspace, pipeline_yaml)
        if experiment_name is None:
            raise UserErrorException("Please specify an experiment name")

    assigned_params = _parse_key_values(pipeline_params, 'Parameter assignment')

    datapaths = _parse_key_values(datapath_params, 'Datapath assignment')
    for datapath_param_name in datapaths:
        datastore_with_path = datapaths[datapath_param_name]
        if '/' not in datastore_with_path:
            raise UserErrorException("Datapath value %s should have format datastore/path" % datastore_with_path)
        path_tokens = datastore_with_path.split('/', 1)
        from azureml.core import Datastore
        from azureml.data.datapath import DataPath
        datastore = Datastore(workspace, path_tokens[0])
        assigned_params[datapath_param_name] = DataPath(datastore=datastore, path_on_datastore=path_tokens[1])

    dict_output = _pipeline_run_submit(experiment_name, assigned_params, published_pipeline, pipeline,
                                       workspace, output_file, logger)
    return dict_output
def datastore_upload_files(args):
    """
    Get the default datastore and upload files into it
    """
    workspace = package_utils.get_workspace()
    datastore = package_utils.get_default_datastore(workspace)

    directory = pathlib.Path(args.dataset_path, args.dataset_name)
    if not os.path.exists(directory):
        msg = f"The dataset directory {directory} does not exist"
        logger.exception(msg)
        raise RuntimeError(msg)

    files = [
        os.path.abspath(file) for file in sorted(glob.glob(f"{directory}/*.csv"))
    ]
    target_path = f"{args.dataset_name}_{args.dataset_version}"
    kwargs = {
        "files": files,
        "target_path": target_path,
        "overwrite": args.dataset_overwrite,
    }
    logger.info(msg="datastore.upload_files", extra={"kwargs": kwargs})

    if not args.dry_run:
        try:
            _ = upload_files(datastore, **kwargs)
        except Exception:
            msg = f"Upload to target_path {target_path} failed"
            logger.exception(msg)
            raise RuntimeError(msg)

    datastore_path = [
        DataPath(datastore, str(pathlib.Path(target_path, os.path.basename(file))))
        for file in files
    ]

    return datastore_path, target_path
def upload(self, folder_to_upload, path_datastore, dataset_name=None):
    """
    Upload files to Azure Blob Storage attached to AzureML Workspace.

    Args:
        folder_to_upload: Local folder to be uploaded to the DataStore.
        path_datastore: Path in the Datastore where files in folder_to_upload will be stored.
        dataset_name: Name of the Dataset created as a result of the upload.

    Returns:
        Returns a FileDataset of the uploaded folder in the Datastore.
    """
    targetPath = DataPath(self.datastore, path_datastore)
    fileDataset = Dataset.File.upload_directory(folder_to_upload, targetPath)
    if dataset_name is not None:
        fileDataset.register(self.workspace, dataset_name)
    return fileDataset
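# Illustrative usage sketch (not part of the original source): calling the helper above,
# assuming it lives on a small wrapper class that holds `workspace` and `datastore`
# attributes. The class name, local folder and dataset name are hypothetical.
#
#   helper = DatastoreHelper(workspace=ws, datastore=ws.get_default_datastore())
#   training_files = helper.upload(
#       folder_to_upload="./data/train",
#       path_datastore="datasets/train",
#       dataset_name="training-files",
#   )
#   print(training_files.to_path())  # paths now tracked by the returned FileDataset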
#### Define Pipeline! ####
##########################
# The following will be created and then run:
# 1. Pipeline Parameters
# 2. Data Process Step
# 3. Training Step
# 4. Model Registration Step
# 5. Pipeline registration
# 6. Submit the pipeline for execution

## Pipeline Parameters ##
# We need to tell the Pipeline what it needs to learn to see!
datapath = DataPath(datastore=datastore, path_on_datastore=datastorepath)
data_path_pipeline_param = (PipelineParameter(name="data", default_value=datapath),
                            DataPathComputeBinding(mode='mount'))

# Configuration for data prep and training steps #
dataprepEnvironment = Environment.from_pip_requirements('dataprepenv', 'requirements-dataprepandtraining.txt')
dataprepRunConfig = RunConfiguration()
dataprepRunConfig.environment = dataprepEnvironment

## Data Process Step ##
# parse.py file parses the images in our data source #
seer_tfrecords = PipelineData("tfrecords_set",
from azureml.data.datapath import DataPath, DataPathComputeBinding
from azureml.train.dnn import TensorFlow

clusterName = "NV6AICluster"

# Load workspace
ws = Workspace.from_config()

# Connect to Compute Target
computeCluster = ComputeTarget(workspace=ws, name=clusterName)

# connect to datastores
source_ds = Datastore.get(ws, 'SimpsonDataStore')
training_ds = Datastore.get(ws, 'SimpsonTrainingDataStore')

source_dataset = DataPath(datastore=source_ds, path_on_datastore="trainingdata")

# Parameters make it easy for us to re-run this training pipeline, including for retraining.
source_dataset_param = (PipelineParameter(name="source_dataset", default_value=source_dataset),
                        DataPathComputeBinding())

script_folder = "./steps"

# == Step 1 ==
cd = CondaDependencies.create(pip_packages=["azureml-sdk", "opencv-python"])
amlcompute_run_config = RunConfiguration(conda_dependencies=cd)

training_data_location = PipelineData(name="trainingdata", datastore=training_ds)

preProcessDataStep = PythonScriptStep(name="Pre-process data",
                                      script_name="prep.py",
TRAIN_DATA_SPLIT = 0.8
NUMBER_ESTIMATORS = 10
TRAIN_FOLDER_NAME = "src/train"
TRAIN_FILE_NAME = "train.py"
MODELNAME = "script-classifier"
SERVICENAME = "script-deployment"
MODELFILENAME = "model.pkl"

ws = Workspace.from_config()
exp = Experiment(ws, "MaxFreezerTemperatureExceeded", _create_in_cloud=True)
logger.info("Experiment created")

# ACCESS DATA
datastore = Datastore.get(ws, "sensordata")
datapath = DataPath(datastore=datastore, path_on_datastore="/processed/json/**")
dataset = Dataset.Tabular.from_json_lines_files(
    path=datapath,
    validate=True,
    include_path=False,
    set_column_types={
        "allevents": DataType.to_string(),
        "ConnectionDeviceID": DataType.to_string(),
    },
    partition_format="/{PartitionDate:yyyy/MM/dd}/",
)
dataset.register(
    workspace=ws,
    name="processed_json",
    description="Output from Stream Analytics",
    create_new_version=True,
#########################MODIFY###########################
#Get Azure SQL Datastore - CHANGE AZURE SQL DATASTORE NAME
azsql_ds = Datastore.get(ws, 'azsql_ds')

#UPDATE QUERY STRING HERE
query_string = 'SELECT * FROM Filter WHERE D={}'.format(str(query_param))
##########################################################

#Query Azure SQL Datastore
filter_sql_query = DataPath(azsql_ds, query_string)
filter_sql_ds = Dataset.Tabular.from_sql_query(filter_sql_query, query_timeout=10)

#Convert dataset to pandas dataframe
filter_df = filter_sql_ds.to_pandas_dataframe()

#Write dataframe to output dataset path
os.makedirs(filter_dataset, exist_ok=True)
filter_df.to_csv(os.path.join(filter_dataset, 'filter_data.csv'), index=False)
# Storage account access key
account_key = env.storage_account_key

# Reuse the blob datastore if it is already registered, otherwise register it
try:
    blob_datastore = Datastore.get(aml_workspace, blob_datastore_name)
    print('Found existing datastore, use it.')
except HttpOperationError:
    blob_datastore = Datastore.register_azure_blob_container(
        workspace=aml_workspace,
        datastore_name=blob_datastore_name,
        container_name=container_name,
        account_name=account_name,
        account_key=account_key)
    print("Registered blob datastore with name: %s" % blob_datastore_name)

# Register dataset without creating new version
input_datastore_paths = [DataPath(blob_datastore, env.input_dataset_name)]
input_dataset = Dataset.File.from_files(path=input_datastore_paths)
input_dataset = input_dataset.register(workspace=aml_workspace,
                                       name=env.input_dataset_name,
                                       description=env.input_dataset_name)
print("Registered dataset: %s" % input_dataset.name)

waves_datastore_paths = [DataPath(blob_datastore, env.waves_dataset_name)]
waves_dataset = Dataset.File.from_files(path=waves_datastore_paths)
waves_dataset = waves_dataset.register(workspace=aml_workspace,
                                       name=env.waves_dataset_name,
                                       description=env.waves_dataset_name)
print("Registered dataset: %s" % waves_dataset.name)
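# Illustrative usage sketch (not part of the original source): one way the registered file
# dataset can be consumed by a training run. The script, source directory, compute target
# name and experiment name below are assumptions.
from azureml.core import Experiment, ScriptRunConfig

src = ScriptRunConfig(
    source_directory="src/train",     # hypothetical folder
    script="train.py",                # hypothetical script
    arguments=["--data", input_dataset.as_named_input("input_data").as_mount()],
    compute_target="cpu-cluster",     # hypothetical compute target
)
run = Experiment(aml_workspace, "waves-training").submit(src)
run.wait_for_completion(show_output=True)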
from azureml.core import Dataset, Datastore
from azureml.data.datapath import DataPath
from azureml.data.dataset_factory import TabularDatasetFactory

# Connect to the Azure Machine Learning Workspace
azureml_workspace = Workspace.from_config(auth=sp_auth)

# Like the DBFS mount, the Azure ML Datastore references the same `processed` container on Azure Storage
processed_ds = Datastore.get(azureml_workspace, 'datastoreprocessed')

# Dataset A: a subset of comments in the gaming category.
# We use it to run a quick feasibility-analysis experiment, and as a cost-effective way to
# experiment with changes while we iterate on model versions.
comments_subset_gaming_dataset = TabularDatasetFactory.from_parquet_files([
    DataPath(processed_ds, path)
    for path in match_pattern_on_storage("redditcomments/subreddit=gaming/*.parquet")
])

# Dataset B: the full set of comments for at-scale model training
comments_full_dataset = TabularDatasetFactory.from_parquet_files([
    DataPath(processed_ds, path)
    for path in match_pattern_on_storage("redditcomments/*/*.parquet")
])

# Register the dataset versions in Azure ML for reference during training
comments_full_dataset.register(azureml_workspace,
                               name="redditcomments",
                               create_new_version=True,
                               description="The full dataset of comments")
#### Define Pipeline! ####
##########################
# The following will be created and then run:
# 1. Pipeline Parameters
# 2. Data Process Step
# 3. Training Step
# 4. Model Registration Step
# 5. Pipeline registration
# 6. Submit the pipeline for execution

## Pipeline Parameters ##
# We need to tell the Pipeline what it needs to learn to see!
source_dataset = DataPath(datastore=ds, path_on_datastore="seer")
source_dataset_param = (PipelineParameter(name="source_dataset", default_value=source_dataset),
                        DataPathComputeBinding())

# Configuration for data prep and training steps #

## Data Process Step ##
# prep.py file versions our data in our data source #

# Output location for the pre-processed training images
training_data_location = PipelineData(name="seertrainingdata", datastore=ds)

# Create the pre-process step
preProcessDataStep = PythonScriptStep(name="Pre-process data",
from azureml.data.datapath import DataPath

# Get workspace
ws = Workspace(
    subscription_id=args.subscription_id,
    resource_group=args.resource_group,
    workspace_name=args.workspace_name
)

files = [h5_path]
datastore = ws.get_default_datastore()
datastore.upload_files(
    files=files,
    relative_root=args.outputs_path,
    target_path=args.outputs_path
)
logger.success("Files uploaded to '{}' in the datastore".format(args.outputs_path))

# Create dataset and register it
paths = [
    DataPath(datastore=datastore, path_on_datastore=h5_path),
]
dataset = Dataset.File.from_files(path=paths)
ds_name = "voc-classification"
dataset.register(
    workspace=ws,
    name=ds_name,
    description="Preprocessed features and labels of Pascal VOC 2012: 0-padding, resizing and features normalization on classification task"
)
print("File dataset {} registered".format(ds_name))
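# Illustrative usage sketch (not part of the original source): a later session can retrieve
# the registered dataset by name instead of re-uploading the files. The local target folder
# is an assumption.
from azureml.core import Dataset, Workspace

ws = Workspace.from_config()
voc_dataset = Dataset.get_by_name(ws, name="voc-classification")  # latest registered version
local_paths = voc_dataset.download(target_path="./data/voc", overwrite=True)
print(local_paths)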
##########################
# The following will be created and then run:
# 1. Pipeline Parameters
# 2. Data Process Step
# 3. Training Step
# 4. Model Registration Step
# 5. Pipeline registration
# 6. Submit the pipeline for execution

## Pipeline Parameters ##
# We need to tell the Pipeline what it needs to learn to see!
source_dataset = DataPath(datastore=ds, path_on_datastore="simpsonslego-v3")
source_dataset_param = (PipelineParameter(name="source_dataset", default_value=source_dataset),
                        DataPathComputeBinding())

# Configuration for data prep and training steps #

## Data Process Step ##
# prep.py file versions our data in our data source #

# Output location for the pre-processed training images
training_data_location = PipelineData(name="simpsons_training_data", datastore=ds)

# Create the pre-process step
preProcessDataStep = PythonScriptStep(
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import Dataset

subscription_id = 'bd04922c-a444-43dc-892f-74d5090f8a9a'
resource_group = 'mlplayarearg'
workspace_name = 'testdeployment'
workspace = Workspace(subscription_id, resource_group, workspace_name)

mydatastore = Datastore.get(workspace, 'billingdatablobstorage')

from azureml.data.datapath import DataPath, DataPathComputeBinding
from azureml.pipeline.core.graph import PipelineParameter

data_path = DataPath(datastore=mydatastore, path_on_datastore='rawdata')
datapath1_pipeline_param = PipelineParameter(name="input_datapath", default_value=data_path)
datapath_input = (datapath1_pipeline_param, DataPathComputeBinding(mode='mount'))

string_pipeline_param = PipelineParameter(name="input_string", default_value='sample_string1')

compute_config = RunConfiguration()
compute_config.target = "cpu-cluster"
dependencies = CondaDependencies()
dependencies.add_pip_package("adal==0.4.7")
compute_config.environment.python.conda_dependencies = dependencies
from azureml.pipeline.core import Pipeline, PipelineData, PipelineParameter
from azureml.pipeline.steps import PythonScriptStep

workspace = Workspace.from_config()
blobstore = workspace.get_default_datastore()

environment = Environment.get(workspace, name="AzureML-Scikit-learn-0.20.3")
environment.docker.enabled = True

run_config = RunConfiguration()
run_config.environment = environment

compute_target = workspace.compute_targets["cpu"]
run_config.target = compute_target

train_features_datapath = DataPath(
    datastore=blobstore, path_on_datastore="training_set_features.csv")
train_features_path_parameter = PipelineParameter(
    name="train_features", default_value=train_features_datapath)
train_features_path = (train_features_path_parameter,
                       DataPathComputeBinding(mode="mount"))

train_labels_datapath = DataPath(datastore=blobstore,
                                 path_on_datastore="training_set_labels.csv")
train_labels_path_parameter = PipelineParameter(
    name="train_labels", default_value=train_labels_datapath)
train_labels_path = (train_labels_path_parameter,
                     DataPathComputeBinding(mode="mount"))

test_features_datapath = DataPath(datastore=blobstore,
                                  path_on_datastore="test_set_features.csv")
test_features_path_parameter = PipelineParameter(
#########################MODIFY###########################
#Get Azure SQL Datastore - CHANGE AZURE SQL DATASTORE NAME
azsql_ds = Datastore.get(ws, 'azsql_ds')

#UPDATE QUERY STRING HERE
query_string = 'SELECT * FROM Profile'
##########################################################

#Query Azure SQL Datastore
profile_sql_query = DataPath(azsql_ds, query_string)
profile_sql_ds = Dataset.Tabular.from_sql_query(profile_sql_query, query_timeout=10)

#Convert dataset to pandas dataframe and return
profile_df = profile_sql_ds.to_pandas_dataframe()

#Write dataframe to output dataset path
os.makedirs(profile_dataset, exist_ok=True)
profile_df.to_csv(os.path.join(profile_dataset, 'profile_data.csv'), index=False)
pip_packages=["azureml-defaults", 'tensorflow==1.8.0']) amlcompute_run_config = RunConfiguration(conda_dependencies=cd) # Define our computes data_factory_compute = DataFactoryCompute(ws, data_factory_name) aml_compute = AmlCompute(ws, aml_compute_target) # We explicitly declare the data we're using in this training pipeline source_images = DataReference(datastore=source_ds, data_reference_name="original_images", path_on_datastore=default_dataset) dest_images = DataReference(datastore=ds, data_reference_name="transferred_images", path_on_datastore='training_images') training_dataset = DataPath(datastore=source_ds, path_on_datastore=default_dataset) # Parameters make it easy for us to re-run this training pipeline, including for retraining. model_variant = PipelineParameter(name="model_variant", default_value='sodacans') training_dataset_param = (PipelineParameter(name="training_dataset", default_value=training_dataset), DataPathComputeBinding()) # Copying data into a datastore we manage ensures we can reproduce the model later on. datatransfer = DataTransferStep( name= "Copy training data for improved performance and model reproducibility", source_data_reference=source_images, destination_data_reference=dest_images, compute_target=data_factory_compute)
def create_pipeline(workspace):
    # Retrieve compute cluster
    compute_target = workspace.compute_targets[args.compute_target]

    # Set up batch scoring environment from conda dependencies
    environment = Environment.from_conda_specification(
        name=args.environment_name, file_path=args.environment_specification
    )

    # Add environment variables
    environment.environment_variables = {
        "APPLICATIONINSIGHTS_CONNECTION_STRING": args.ai_connection_string
    }

    # Enable docker run
    environment.docker.enabled = True

    # Create run config
    run_config = RunConfiguration()
    run_config.environment = environment

    # Retrieve input and output datastores
    input_datastore = Datastore(workspace, args.input_datastore_name)
    output_datastore = Datastore(workspace, args.output_datastore_name)

    # Define build id parameter
    build_id_param = PipelineParameter("build_id", default_value=args.build_id)

    # Define input datapath parameter
    input_datapath = DataPath(datastore=input_datastore, path_on_datastore="")
    input_datapath_param = (
        PipelineParameter(name="input_datapath", default_value=input_datapath),
        DataPathComputeBinding(mode="mount"),
    )

    # Define output datapath parameter
    output_datapath = DataPath(datastore=output_datastore, path_on_datastore="")
    output_datapath_param = (
        PipelineParameter(name="output_datapath", default_value=output_datapath),
        DataPathComputeBinding(mode="mount"),
    )

    # Define score step for pipeline
    score_step = PythonScriptStep(
        name="score_data",
        compute_target=compute_target,
        source_directory="src/score",
        script_name="score.py",
        inputs=[input_datapath_param, output_datapath_param],
        runconfig=run_config,
        allow_reuse=False,
        arguments=[
            "--build_id", build_id_param,
            "--input_datapath", input_datapath_param,
            "--output_datapath", output_datapath_param,
        ],
    )

    # Define pipeline for batch scoring
    pipeline = Pipeline(workspace=workspace, steps=[score_step])

    return pipeline
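# Illustrative usage sketch (not part of the original source): submitting the pipeline built
# by create_pipeline() with its DataPath parameters overridden, so one definition can score
# arbitrary datastore folders. The experiment name and folder paths are assumptions.
from azureml.core import Experiment

input_datastore = Datastore(workspace, args.input_datastore_name)
output_datastore = Datastore(workspace, args.output_datastore_name)

pipeline = create_pipeline(workspace)
run = Experiment(workspace, "batch-scoring").submit(
    pipeline,
    pipeline_parameters={
        "build_id": args.build_id,
        "input_datapath": DataPath(datastore=input_datastore, path_on_datastore="tests/inputs/manual"),
        "output_datapath": DataPath(datastore=output_datastore, path_on_datastore="tests/outputs/manual"),
    },
)
run.wait_for_completion(show_output=True)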
def create_experiment_config(workspace):
    ########################################
    ### Creating data prep Pipeline Step ###
    ########################################

    # Load settings
    print("Loading settings")
    data_prep_step_path = os.path.join("steps", "data_prep")
    with open(os.path.join(data_prep_step_path, "step.json")) as f:
        data_prep_settings = json.load(f)

    # Setup datasets - Create PipelineParameter for dynamic pipeline input
    print("Setting up datasets with dynamic input")
    data_prep_input_path = DataPath(
        datastore=Datastore(workspace=workspace,
                            name=data_prep_settings.get("datastore_input_name", "workspaceblobstore")),
        path_on_datastore="golden/Atlantis/PAX1/15-Mar-2020-23-37-50-279971/PAX1.parquet/")
    data_prep_input_path_pipeline_parameter = PipelineParameter(
        name="input_path", default_value=data_prep_input_path)
    data_prep_input = (data_prep_input_path_pipeline_parameter,
                       DataPathComputeBinding(mode="mount"))
    data_prep_output = PipelineData(
        name=data_prep_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=data_prep_settings.get("datastore_output_name", "workspaceblobstore")),
        output_mode="mount").as_dataset()
    # Uncomment next lines, if you want to register intermediate dataset
    #data_prep_output.register(
    #    name=data_prep_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    data_prep_dependencies = CondaDependencies.create(
        pip_packages=data_prep_settings.get("pip_packages", []),
        conda_packages=data_prep_settings.get("conda_packages", []),
        python_version=data_prep_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    data_prep_run_config = RunConfiguration(
        conda_dependencies=data_prep_dependencies,
        framework=data_prep_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    data_prep_compute_target = ComputeTarget(
        workspace=workspace,
        name=data_prep_settings.get("compute_target_name", None))

    # Create python step
    print("Creating Step")
    data_prep = PythonScriptStep(
        name=data_prep_settings.get("step_name", None),
        script_name=data_prep_settings.get("script_name", None),
        arguments=data_prep_settings.get("arguments", []) + ["--input-datapath", data_prep_input],
        compute_target=data_prep_compute_target,
        runconfig=data_prep_run_config,
        inputs=[data_prep_input],
        outputs=[data_prep_output],
        params=data_prep_settings.get("parameters", []),
        source_directory=data_prep_step_path,
        allow_reuse=data_prep_settings.get("allow_reuse", True),
        version=data_prep_settings.get("version", None),
    )

    ############################################
    ### Creating inference Parallel Run Step ###
    ############################################

    # Load settings
    print("Loading settings")
    batch_inference_step_path = os.path.join("steps", "batch_inference")
    with open(os.path.join(batch_inference_step_path, "step.json")) as f:
        batch_inference_settings = json.load(f)

    # Setup datasets of first step
    print("Setting up datasets")
    batch_inference_input = data_prep_output.as_named_input(
        name=batch_inference_settings.get("dataset_input_name", None))
    batch_inference_output = PipelineData(
        name=batch_inference_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=batch_inference_settings.get("datastore_output_name", None)),
        output_mode="mount",
    ).as_dataset()
    # Uncomment next lines, if you want to register intermediate dataset
    #batch_inference_output.register(
    #    name=batch_inference_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    batch_inference_dependencies = CondaDependencies.create(
        pip_packages=batch_inference_settings.get("pip_packages", []),
        conda_packages=batch_inference_settings.get("conda_packages", []),
        python_version=batch_inference_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    batch_inference_run_config = RunConfiguration(
        conda_dependencies=batch_inference_dependencies,
        framework=batch_inference_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    batch_inference_compute_target = ComputeTarget(
        workspace=workspace,
        name=batch_inference_settings.get("compute_target_name", None))

    # Create python step
    print("Creating Step")
    batch_inference = PythonScriptStep(
        name=batch_inference_settings.get("step_name", None),
        script_name=batch_inference_settings.get("script_name", None),
        arguments=batch_inference_settings.get("arguments", []),
        compute_target=batch_inference_compute_target,
        runconfig=batch_inference_run_config,
        inputs=[batch_inference_input],
        outputs=[batch_inference_output],
        params=batch_inference_settings.get("parameters", []),
        source_directory=batch_inference_step_path,
        allow_reuse=batch_inference_settings.get("allow_reuse", True),
        version=batch_inference_settings.get("version", None),
    )

    #########################
    ### Creating Pipeline ###
    #########################

    # Create Pipeline
    print("Creating Pipeline")
    pipeline = Pipeline(
        workspace=workspace,
        steps=[batch_inference],
        description="Batch Inference Pipeline",
    )

    return pipeline