Example No. 1
def main():
    try:
        global workspace
        global datastore

        # Parse command line arguments
        args = parse_args()

        # Retrieve workspace
        workspace = Workspace.get(
            subscription_id=args.subscription_id,
            resource_group=args.resource_group,
            name=args.workspace_name,
        )

        # Retrieve default datastore for testing
        datastore = workspace.get_default_datastore()

        # Define directories for input and output test data on datastore
        input_file_path = f"tests/inputs/{args.build_id}"
        output_file_path = f"tests/outputs/{args.build_id}"

        print("Variable [input_file_path]:", input_file_path)
        print("Variable [output_file_path]:", output_file_path)

        # Copy data to input directory on datastore for testing
        copy_data_for_tests(args.dataset_name, input_file_path)

        # Define pipeline parameters
        pipeline_parameters = {
            "build_id": args.build_id,
            "input_datapath": DataPath(
                datastore=datastore, path_on_datastore=input_file_path
            ),
            "output_datapath": DataPath(
                datastore=datastore, path_on_datastore=output_file_path
            ),
        }

        print("Variable [pipeline_parameters]:", pipeline_parameters)

        # Run pipeline
        run_pipeline(workspace, args.pipeline_name, pipeline_parameters)

        # List all files in input and output datasets
        input_dataset_files = get_dataset_file(input_file_path)
        output_dataset_files = get_dataset_file(output_file_path)

        print("Variable [input_dataset_files]:", input_dataset_files)
        print("Variable [output_dataset_files]:", output_dataset_files)

        # Should have scored all input files and saved result to output datastore
        assert len(input_dataset_files) == len(output_dataset_files)

    except Exception:
        print(f"Exception: run_pipeline.py\n{traceback.format_exc()}")
        exit(1)
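A note on the helpers used above: run_pipeline and get_dataset_file are not shown in this example. As a rough sketch (not the author's implementation), run_pipeline could look up the published pipeline by name and submit it with the DataPath parameters; only PublishedPipeline.list/submit and wait_for_completion are SDK calls, the lookup-by-name logic is an assumption.

from azureml.pipeline.core import PublishedPipeline


def run_pipeline(workspace, pipeline_name, pipeline_parameters):
    # Hypothetical lookup: take the first active published pipeline with a matching name
    published = next(
        p for p in PublishedPipeline.list(workspace) if p.name == pipeline_name
    )

    # Submit the published pipeline with the DataPath parameters and wait for it to finish
    run = published.submit(
        workspace=workspace,
        experiment_name=pipeline_name,
        pipeline_parameters=pipeline_parameters,
    )
    run.wait_for_completion(show_output=True)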
def main():
    try:
        global args

        # Parse command line arguments
        args = parse_args(sys.argv[1:])

        # Retrieve workspace
        workspace = Workspace.get(
            subscription_id=args.subscription_id,
            resource_group=args.resource_group,
            name=args.workspace_name,
        )

        if args.pipeline_action == "draft":
            pipeline = create_pipeline(workspace)
            draft_pipeline(
                workspace,
                pipeline,
                args.pipeline_name,
                args.experiment_name,
                args.build_id,
                args.pipeline_metadata_file,
            )

        elif args.pipeline_action == "run":
            # Define pipeline parameters
            pipeline_parameters = {
                "build_id": args.build_id,
                "input_datapath": DataPath(
                    datastore=args.input_datastore_name,
                    path_on_datastore=args.input_datastore_path,
                ),
                "output_datapath": DataPath(
                    datastore=args.output_datastore_name,
                    path_on_datastore=args.output_datastore_path,
                ),
            }

            run_pipeline(workspace, args.pipeline_name, pipeline_parameters)

        elif args.pipeline_action == "publish":
            publish_pipeline(
                workspace, args.pipeline_name, args.disable_published_pipelines
            )

        else:
            raise Exception(f"Invalid pipeline action: {args.pipeline_action}")

    except Exception:
        exception = f"Exception: train_pipeline.py\n{traceback.format_exc()}"
        print(exception)
        exit(1)
def get_input_dataset(ws: Workspace, ds: Datastore, env: Env) -> Dataset:
    """
    Gets an input dataset wrapped around an input data file. The input
    data file is assumed to exist in the supplied datastore.


    :param ws: AML Workspace
    :param ds: Datastore containing the data file
    :param env: Environment variables

    :returns: Input Dataset
    """

    scoringinputds = Dataset.Tabular.from_delimited_files(
        path=DataPath(ds, env.scoring_datastore_input_filename)
    )

    scoringinputds = scoringinputds.register(
        ws,
        name=env.scoring_dataset_name,
        tags={"purpose": "scoring input", "format": "csv"},
        create_new_version=True,
    ).as_named_input(env.scoring_dataset_name)

    return scoringinputds
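The named input returned above is meant to be attached to a pipeline or estimator step; inside the step's script it resolves through the run context. A minimal sketch, assuming the dataset was registered under the placeholder name "scoring_dataset" (i.e. whatever env.scoring_dataset_name holds):

from azureml.core import Run

# Inside the scoring script: the named input resolves to the registered TabularDataset
run = Run.get_context()
scoring_df = run.input_datasets["scoring_dataset"].to_pandas_dataframe()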
Example No. 4
def process_step(datastore: Datastore, compute: ComputeTarget,
                 path_on_datastore: str) -> (PipelineData, EstimatorStep):
    datapath = DataPath(datastore=datastore,
                        path_on_datastore=path_on_datastore)
    data_path_pipeline_param = (PipelineParameter(name="data",
                                                  default_value=datapath),
                                DataPathComputeBinding(mode='mount'))

    seer_tfrecords = PipelineData("tfrecords_set",
                                  datastore=datastore,
                                  is_directory=True)

    prep = Estimator(source_directory='.',
                     compute_target=compute,
                     entry_script='prep.py',
                     pip_requirements_file='requirements.txt')

    prepStep = EstimatorStep(name='Data Preparation',
                             estimator=prep,
                             estimator_entry_script_arguments=[
                                 "--source_path", data_path_pipeline_param,
                                 "--target_path", seer_tfrecords
                             ],
                             inputs=[data_path_pipeline_param],
                             outputs=[seer_tfrecords],
                             compute_target=compute)

    return seer_tfrecords, prepStep
Example No. 5
    def load_tabular_partition(self,
                               partition_name: str,
                               datastore_name: str = None,
                               columns: np.array = None,
                               first_row_header: bool = False,
                               cloud_storage: bool = True) -> pd.DataFrame:
        '''
        Loads a partition from a tabular dataset.
            The implementation will connect to the DataStore and get all delimited files matching the partition_name.
            When configured locally, the implementation will append all files in the datastore_path with name {partition_name}.csv
        Args:
            partition_name (str): The name of the partition as a wildcard filter. Example: B* will take all files starting with B and ending with csv
            datastore_name (str): The name of a DataStore that contains Datasets
            columns (np.array): The column names to assign to the dataframe
            first_row_header (bool): Whether the first row of each file contains the column headers
            cloud_storage (bool): When changed to False, the dataset will be loaded from the local folder
        Returns:
            pd.DataFrame: The dataset, loaded as a DataFrame
        '''
        if not datastore_name:
            # No datastore name is given, so we'll take the default one
            datastore_name = self.__datastore_path

        if cloud_storage:
            # Connecting data store
            datastore = Datastore(self.__workspace, name=datastore_name)
            try:
                _header = PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS if first_row_header else False
                _aml_dataset = Dataset.Tabular.from_delimited_files(
                    header=_header,
                    path=DataPath(datastore, '/' + partition_name +
                                  '.csv'))  #, set_column_types=columns
                _df = _aml_dataset.to_pandas_dataframe()
            except DatasetValidationError as dsvalex:
                if 'provided path is not valid' in str(dsvalex):
                    return None
                else:
                    raise
        else:
            # Reading data from sub files in a folder
            _folder_path = datastore_name
            _partition_files = glob.glob(_folder_path + '/' + partition_name +
                                         '.csv')
            _record_found = False
            _df = None
            for filename in _partition_files:
                _header = 0 if first_row_header else None
                df = pd.read_csv(filename, index_col=None, header=_header)
                if not _record_found:
                    _df = df
                    _record_found = True
                else:
                    _df = pd.concat([_df, df])

            if not _record_found:
                return None

        if columns is not None:
            _df.columns = columns
        return _df
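A usage sketch for the method above; the owning object (here called azure_helper), the datastore name and the column names are placeholders:

import numpy as np

# Load every cloud-stored delimited file whose name starts with "B" into one DataFrame
df = azure_helper.load_tabular_partition(
    partition_name="B*",
    datastore_name="machinelearningsamples",
    columns=np.array(["timestamp", "sensor", "value"]),
    first_row_header=False,
    cloud_storage=True,
)
if df is None:
    print("No partition files matched the wildcard")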
Example No. 6
def get_input_dataset(workspace, datastore, env):
    scoring_input_ds = Dataset.Tabular.from_delimited_files(
        path=DataPath(datastore, env.scoring_datastore_input_filename))
    scoring_input_ds = scoring_input_ds.register(
        workspace=workspace,
        name=env.scoring_dataset_name,
        tags={
            'purpose': 'for scoring',
            'format': 'csv'
        },
        create_new_version=True).as_named_input(env.scoring_dataset_name)

    return scoring_input_ds
Example No. 7
def submit_pipeline(
        workspace=None,  # Auto populated args + object
        pipeline_id=None,
        experiment_name=None,
        pipeline_yaml=None,
        pipeline_params=None,
        datapath_params=None,
        output_file=None,
        # We enforce a logger
        logger=None):
    """
    Submit a pipeline run based on a published pipeline ID
    """

    if pipeline_id is None and pipeline_yaml is None:
        raise UserErrorException("Please specify a pipeline ID or a pipeline YAML file")

    published_pipeline = None
    pipeline = None

    if pipeline_id is not None:
        from azureml.pipeline.core import PublishedPipeline
        published_pipeline = PublishedPipeline.get(workspace, pipeline_id)
        if experiment_name is None or experiment_name == '':
            # Use the pipeline name as the experiment name
            experiment_name = published_pipeline._sanitize_name()

    else:
        from azureml.pipeline.core import Pipeline
        pipeline = Pipeline.load_yaml(workspace, pipeline_yaml)

    if experiment_name is None:
        raise UserErrorException("Please specify an experiment name")

    assigned_params = _parse_key_values(pipeline_params, 'Parameter assignment')

    datapaths = _parse_key_values(datapath_params, 'Datapath assignment')
    for datapath_param_name in datapaths:
        datastore_with_path = datapaths[datapath_param_name]
        if '/' not in datastore_with_path:
            raise UserErrorException("Datapath value %s should have format datastore/path" % datastore_with_path)
        path_tokens = datastore_with_path.split('/', 1)
        from azureml.core import Datastore
        from azureml.data.datapath import DataPath
        datastore = Datastore(workspace, path_tokens[0])
        assigned_params[datapath_param_name] = DataPath(datastore=datastore, path_on_datastore=path_tokens[1])

    dict_output = _pipeline_run_submit(experiment_name, assigned_params, published_pipeline, pipeline,
                                       workspace, output_file, logger)

    return dict_output
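An invocation sketch for submit_pipeline. The helper _parse_key_values is not shown; this assumes it accepts a list of name=value strings, and each datapath value must use the datastore/path format enforced above. All IDs, names and paths are placeholders:

submit_pipeline(
    workspace=ws,
    pipeline_id="11111111-2222-3333-4444-555555555555",
    experiment_name="batch-scoring",
    pipeline_params=["build_id=20210101.1"],
    datapath_params=["input_datapath=workspaceblobstore/tests/inputs/20210101.1"],
    output_file=None,
    logger=logger,
)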
Example No. 8
def datastore_upload_files(args):
    """
    Get the default datastore and upload files into it
    """
    workspace = package_utils.get_workspace()
    datastore = package_utils.get_default_datastore(workspace)

    directory = pathlib.Path(args.dataset_path, args.dataset_name)
    if not os.path.exists(directory):
        msg = f"The dataset directory {directory} does not exist"
        logger.exception(msg)
        raise RuntimeError(msg)

    files = [
        os.path.abspath(file)
        for file in sorted(glob.glob(f"{directory}/*.csv"))
    ]
    target_path = f"{args.dataset_name}_{args.dataset_version}"
    kwargs = {
        "files": files,
        "target_path": target_path,
        "overwrite": args.dataset_overwrite,
    }
    logger.info(msg="datastore.upload_files", extra={"kwargs": kwargs})
    if not args.dry_run:
        try:
            _ = upload_files(datastore, **kwargs)
        except Exception:
            msg = f"Upload to target_path {target_path} failed"
            logger.exception(msg)
            raise RuntimeError(msg)

    datastore_path = [
        DataPath(datastore,
                 str(pathlib.Path(target_path, os.path.basename(file))))
        for file in files
    ]

    return datastore_path, target_path
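The returned DataPath list can be wrapped into a FileDataset and registered, mirroring the other examples on this page; a minimal sketch in which the dataset name and description are assumptions:

from azureml.core import Dataset

workspace = package_utils.get_workspace()
datastore_path, target_path = datastore_upload_files(args)

# Build a FileDataset over the uploaded files and register it
file_dataset = Dataset.File.from_files(path=datastore_path)
file_dataset.register(
    workspace=workspace,
    name=f"{args.dataset_name}_{args.dataset_version}",
    description=f"Files uploaded to {target_path}",
    create_new_version=True,
)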
Example No. 9
    def upload(self, folder_to_upload, path_datastore, dataset_name=None):
        """
        Upload files to Azure Blob Storage attached to AzureML Workspace.

        Args:
            folder_to_upload: Local folder to be uploaded to the DataStore.
            path_datastore: Path in the Datastore where files in
            folder_to_upload will be stored.
            dataset_name: Name of the Dataset created as a result of the
            upload.

        Returns:
            Returns a FileDataset of the uploaded folder in the Datastore.

        """
        targetPath = DataPath(self.datastore, path_datastore)
        fileDataset = Dataset.File.upload_directory(
            folder_to_upload,
            targetPath)
        if dataset_name is not None:
            fileDataset.register(self.workspace, dataset_name)
        return fileDataset
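A usage sketch for the upload method above; the wrapper instance, local folder, datastore path and dataset name are all placeholders:

# Upload a local folder to the datastore and register the resulting FileDataset
file_dataset = wrapper.upload(
    folder_to_upload="./data/images",
    path_datastore="datasets/images-v1",
    dataset_name="images-v1",
)
print(file_dataset.to_path()[:5])  # peek at the first few files in the dataset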
Example No. 10
#### Define Pipeline! ####
##########################

# The following will be created and then run:
# 1. Pipeline Parameters
# 2. Data Process Step
# 3. Training Step
# 4. Model Registration Step
# 5. Pipeline registration
# 6. Submit the pipeline for execution

## Pipeline Parameters ##
# We need to tell the Pipeline what it needs to learn to see!

datapath = DataPath(datastore=datastore, path_on_datastore=datastorepath)
data_path_pipeline_param = (PipelineParameter(name="data",
                                              default_value=datapath),
                            DataPathComputeBinding(mode='mount'))

# Configuration for data prep and training steps #

dataprepEnvironment = Environment.from_pip_requirements(
    'dataprepenv', 'requirements-dataprepandtraining.txt')
dataprepRunConfig = RunConfiguration()
dataprepRunConfig.environment = dataprepEnvironment

## Data Process Step ##
# parse.py file parses the images in our data source #

seer_tfrecords = PipelineData("tfrecords_set",
Example No. 11
from azureml.data.datapath import DataPath, DataPathComputeBinding
from azureml.train.dnn import TensorFlow

clusterName = "NV6AICluster"

# Load workspace
ws = Workspace.from_config()

# Connect to Compute Target
computeCluster = ComputeTarget(workspace=ws, name=clusterName)

# connect to datastores
source_ds = Datastore.get(ws, 'SimpsonDataStore')
training_ds = Datastore.get(ws, 'SimpsonTrainingDataStore')

source_dataset = DataPath(datastore=source_ds, path_on_datastore="trainingdata")

# Parameters make it easy for us to re-run this training pipeline, including for retraining.
source_dataset_param = (PipelineParameter(name="source_dataset",default_value=source_dataset),
                          DataPathComputeBinding())

script_folder = "./steps"

# == Step 1 ==
cd = CondaDependencies.create(pip_packages=["azureml-sdk","opencv-python"])
amlcompute_run_config = RunConfiguration(conda_dependencies=cd)

training_data_location = PipelineData(name="trainingdata", datastore=training_ds)

preProcessDataStep = PythonScriptStep(name="Pre-process data",
                            script_name="prep.py",
Example No. 12
TRAIN_DATA_SPLIT = 0.8
NUMBER_ESTIMATORS = 10
TRAIN_FOLDER_NAME = "src/train"
TRAIN_FILE_NAME = "train.py"
MODELNAME = "script-classifier"
SERVICENAME = "script-deployment"
MODELFILENAME = "model.pkl"

ws = Workspace.from_config()
exp = Experiment(ws, "MaxFreezerTemperatureExceeded", _create_in_cloud=True)
logger.info("Experiment created")

# ACCESS DATA

datastore = Datastore.get(ws, "sensordata")
datapath = DataPath(datastore=datastore,
                    path_on_datastore="/processed/json/**")
dataset = Dataset.Tabular.from_json_lines_files(
    path=datapath,
    validate=True,
    include_path=False,
    set_column_types={
        "allevents": DataType.to_string(),
        "ConnectionDeviceID": DataType.to_string(),
    },
    partition_format="/{PartitionDate:yyyy/MM/dd}/",
)
dataset.register(
    workspace=ws,
    name="processed_json",
    description="Output from Stream Analytics",
    create_new_version=True,

#########################MODIFY###########################



#Get Azure SQL Datastore - CHANGE AZURE SQL DATASTORE NAME
azsql_ds = Datastore.get(ws, 'azsql_ds')

#UPDATE QUERY STRING HERE 
query_string = 'SELECT * FROM Filter WHERE D={}'.format(str(query_param))



##########################################################





#Query Azure SQL Datastore
filter_sql_query = DataPath(azsql_ds, query_string)
filter_sql_ds = Dataset.Tabular.from_sql_query(filter_sql_query, query_timeout=10)

#Convert dataset to pandas dataframe
filter_df = filter_sql_ds.to_pandas_dataframe()

#Write dataframe to output dataset path
os.makedirs(filter_dataset, exist_ok=True)
filter_df.to_csv(os.path.join(filter_dataset, 'filter_data.csv'), index=False)
Example No. 14
# Storage account access key
account_key = env.storage_account_key

# Use the existing blob datastore if one is already registered; otherwise register it
try:
    blob_datastore = Datastore.get(aml_workspace, blob_datastore_name)
    print('Found existing datastore, use it.')
except HttpOperationError:
    blob_datastore = Datastore.register_azure_blob_container(
        workspace=aml_workspace,
        datastore_name=blob_datastore_name,
        container_name=container_name,
        account_name=account_name,
        account_key=account_key)
    print("Registered blob datastore with name: %s" % blob_datastore_name)

# Register dataset without creating new version
input_datastore_paths = [DataPath(blob_datastore, env.input_dataset_name)]
input_dataset = Dataset.File.from_files(path=input_datastore_paths)
input_dataset = input_dataset.register(workspace=aml_workspace,
                                       name=env.input_dataset_name,
                                       description=env.input_dataset_name)
print("Registered dataset: %s" % input_dataset.name)

waves_datastore_paths = [DataPath(blob_datastore, env.waves_dataset_name)]
waves_dataset = Dataset.File.from_files(path=waves_datastore_paths)
waves_dataset = waves_dataset.register(workspace=aml_workspace,
                                       name=env.waves_dataset_name,
                                       description=env.waves_dataset_name)
print("Registered dataset: %s" % waves_dataset.name)
from azureml.core import Dataset, Datastore
from azureml.data.datapath import DataPath
from azureml.data.dataset_factory import TabularDatasetFactory

# Connect to the Azure Machine Learning Workspace
azureml_workspace = Workspace.from_config(auth=sp_auth)

# Like the DBFS Mount, the Azure ML Datastore references the same `processed` container on Azure Storage
processed_ds = Datastore.get(azureml_workspace, 'datastoreprocessed')

# Dataset A: a subset of comments in the gaming category.

# We will use it to run a quick feasibility analysis experiment, as well as to have a cost-effective way to experiment with changes while we iterate on model versions.

comments_subset_gaming_dataset = TabularDatasetFactory.from_parquet_files([
    DataPath(processed_ds, path) for path in match_pattern_on_storage(
        "redditcomments/subreddit=gaming/*.parquet")
])

# Dataset B: the full set of comments for model training at scale

comments_full_dataset = TabularDatasetFactory.from_parquet_files([
    DataPath(processed_ds, path)
    for path in match_pattern_on_storage("redditcomments/*/*.parquet")
])

# Register the data set versions in Azure ML for reference during training
comments_full_dataset.register(azureml_workspace,
                               name="redditcomments",
                               create_new_version=True,
                               description="The full dataset of comments")
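Once registered, later runs can fetch the dataset by name instead of rebuilding it from DataPath objects; a minimal sketch using the name registered above:

from azureml.core import Dataset

# Retrieve the latest registered version for training
comments_full = Dataset.get_by_name(azureml_workspace, name="redditcomments")
comments_df = comments_full.to_pandas_dataframe()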
Example No. 16
#### Define Pipeline! ####
##########################

# The following will be created and then run:
# 1. Pipeline Parameters
# 2. Data Process Step
# 3. Training Step
# 4. Model Registration Step
# 5. Pipeline registration
# 6. Submit the pipeline for execution

## Pipeline Parameters ##
# We need to tell the Pipeline what it needs to learn to see!

source_dataset = DataPath(datastore=ds, path_on_datastore="seer")

source_dataset_param = (PipelineParameter(name="source_dataset",
                                          default_value=source_dataset),
                        DataPathComputeBinding())

# Configuration for data prep and training steps #

## Data Process Step ##
# prep.py file versions our data in our data source #

# Output location for the pre-processed training images
training_data_location = PipelineData(name="seertrainingdata", datastore=ds)

# Create the pre-process step
preProcessDataStep = PythonScriptStep(name="Pre-process data",
Example No. 17
    from azureml.data.datapath import DataPath
    # Get workspace
    ws = Workspace(
        subscription_id=args.subscription_id,
        resource_group=args.resource_group,
        workspace_name=args.workspace_name
    )
    files = [
        h5_path
    ]
    datastore = ws.get_default_datastore()
    datastore.upload_files(
        files=files,
        relative_root=args.outputs_path,
        target_path=args.outputs_path
    )
    logger.success("Files uploaded to '{}' in the datastore".format(args.outputs_path))

    # Create dataset and register it
    paths = [
        DataPath(datastore=datastore, path_on_datastore=h5_path),
    ]
    dataset = Dataset.File.from_files(path=paths)

    ds_name = "voc-classification"
    dataset.register(
        workspace=ws,
        name=ds_name,
        description="Preprocessed features and labels of Pascal VOC 2012: 0-padding, resizing and features normalization on classification task"
    )
    print("File dataset {} registered".format(ds_name))
Example No. 18
##########################

# The following will be created and then run:
# 1. Pipeline Parameters
# 2. Data Process Step
# 3. Training Step
# 4. Model Registration Step
# 5. Pipeline registration
# 6. Submit the pipeline for execution


## Pipeline Parameters ##
# We need to tell the Pipeline what it needs to learn to see!

source_dataset = DataPath(
    datastore=ds, 
    path_on_datastore="simpsonslego-v3")

source_dataset_param = (PipelineParameter(name="source_dataset",default_value=source_dataset),
                          DataPathComputeBinding())

# Configuration for data prep and training steps #

## Data Process Step ##
# prep.py file versions our data in our data source #

# Output location for the pre-processed training images
training_data_location = PipelineData(name="simpsons_training_data", datastore=ds)

# Create the pre-process step
preProcessDataStep = PythonScriptStep(
Example No. 19
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import Dataset

subscription_id = 'bd04922c-a444-43dc-892f-74d5090f8a9a'
resource_group = 'mlplayarearg'
workspace_name = 'testdeployment'

workspace = Workspace(subscription_id, resource_group, workspace_name)

mydatastore = Datastore.get(workspace, 'billingdatablobstorage')

from azureml.data.datapath import DataPath, DataPathComputeBinding
from azureml.pipeline.core.graph import PipelineParameter

data_path = DataPath(datastore=mydatastore, path_on_datastore='rawdata')
datapath1_pipeline_param = PipelineParameter(name="input_datapath",
                                             default_value=data_path)
datapath_input = (datapath1_pipeline_param,
                  DataPathComputeBinding(mode='mount'))

string_pipeline_param = PipelineParameter(name="input_string",
                                          default_value='sample_string1')

compute_config = RunConfiguration()
compute_config.target = "cpu-cluster"

dependencies = CondaDependencies()
dependencies.add_pip_package("adal==0.4.7")
compute_config.environment.python.conda_dependencies = dependencies
from azureml.pipeline.core import Pipeline, PipelineData, PipelineParameter
from azureml.pipeline.steps import PythonScriptStep

workspace = Workspace.from_config()
blobstore = workspace.get_default_datastore()

environment = Environment.get(workspace, name="AzureML-Scikit-learn-0.20.3")
environment.docker.enabled = True

run_config = RunConfiguration()
run_config.environment = environment

compute_target = workspace.compute_targets["cpu"]
run_config.target = compute_target

train_features_datapath = DataPath(
    datastore=blobstore, path_on_datastore="training_set_features.csv")
train_features_path_parameter = PipelineParameter(
    name="train_features", default_value=train_features_datapath)
train_features_path = (train_features_path_parameter,
                       DataPathComputeBinding(mode="mount"))

train_labels_datapath = DataPath(datastore=blobstore,
                                 path_on_datastore="training_set_labels.csv")
train_labels_path_parameter = PipelineParameter(
    name="train_labels", default_value=train_labels_datapath)
train_labels_path = (train_labels_path_parameter,
                     DataPathComputeBinding(mode="mount"))

test_features_datapath = DataPath(datastore=blobstore,
                                  path_on_datastore="test_set_features.csv")
test_features_path_parameter = PipelineParameter(




#########################MODIFY###########################


#Get Azure SQL Datastore - CHANGE AZURE SQL DATASTORE NAME
azsql_ds = Datastore.get(ws, 'azsql_ds')

#UPDATE QUERY STRING HERE
query_string = 'SELECT * FROM Profile'


##########################################################





#Query Azure SQL Datastore
profile_sql_query = DataPath(azsql_ds, query_string)
profile_sql_ds = Dataset.Tabular.from_sql_query(profile_sql_query, query_timeout=10)

#Convert dataset to pandas dataframe and return
profile_df = profile_sql_ds.to_pandas_dataframe()

#Write dataframe to output dataset path
os.makedirs(profile_dataset, exist_ok=True)
profile_df.to_csv(os.path.join(profile_dataset, 'profile_data.csv'), index=False)
Example No. 22
    pip_packages=["azureml-defaults", 'tensorflow==1.8.0'])
amlcompute_run_config = RunConfiguration(conda_dependencies=cd)

# Define our computes
data_factory_compute = DataFactoryCompute(ws, data_factory_name)
aml_compute = AmlCompute(ws, aml_compute_target)

# We explicitly declare the data we're using in this training pipeline
source_images = DataReference(datastore=source_ds,
                              data_reference_name="original_images",
                              path_on_datastore=default_dataset)
dest_images = DataReference(datastore=ds,
                            data_reference_name="transferred_images",
                            path_on_datastore='training_images')

training_dataset = DataPath(datastore=source_ds,
                            path_on_datastore=default_dataset)

# Parameters make it easy for us to re-run this training pipeline, including for retraining.
model_variant = PipelineParameter(name="model_variant",
                                  default_value='sodacans')
training_dataset_param = (PipelineParameter(name="training_dataset",
                                            default_value=training_dataset),
                          DataPathComputeBinding())

# Copying data into a datastore we manage ensures we can reproduce the model later on.
datatransfer = DataTransferStep(
    name="Copy training data for improved performance and model reproducibility",
    source_data_reference=source_images,
    destination_data_reference=dest_images,
    compute_target=data_factory_compute)
def create_pipeline(workspace):
    # Retrieve compute cluster
    compute_target = workspace.compute_targets[args.compute_target]

    # Setup batch scoring environment from conda dependencies
    environment = Environment.from_conda_specification(
        name=args.environment_name, file_path=args.environment_specification
    )

    # Add environment variables
    environment.environment_variables = {
        "APPLICATIONINSIGHTS_CONNECTION_STRING": args.ai_connection_string
    }

    # Enable docker run
    environment.docker.enabled = True

    # Create run config
    run_config = RunConfiguration()
    run_config.environment = environment

    # Retrieve input and output datastores
    input_datastore = Datastore(workspace, args.input_datastore_name)
    output_datastore = Datastore(workspace, args.output_datastore_name)

    # Define build id parameter
    build_id_param = PipelineParameter("build_id", default_value=args.build_id)

    # Define input datapath parameter
    input_datapath = DataPath(datastore=input_datastore, path_on_datastore="")
    input_datapath_param = (
        PipelineParameter(name="input_datapath", default_value=input_datapath),
        DataPathComputeBinding(mode="mount"),
    )

    # Define output datapath parameter
    output_datapath = DataPath(datastore=output_datastore, path_on_datastore="")
    output_datapath_param = (
        PipelineParameter(name="output_datapath", default_value=output_datapath),
        DataPathComputeBinding(mode="mount"),
    )

    # Define score step for pipeline
    score_step = PythonScriptStep(
        name="score_data",
        compute_target=compute_target,
        source_directory="src/score",
        script_name="score.py",
        inputs=[input_datapath_param, output_datapath_param],
        runconfig=run_config,
        allow_reuse=False,
        arguments=[
            "--build_id",
            build_id_param,
            "--input_datapath",
            input_datapath_param,
            "--output_datapath",
            output_datapath_param,
        ],
    )

    # Define pipeline for batch scoring
    pipeline = Pipeline(workspace=workspace, steps=[score_step])

    return pipeline
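Because both DataPath parameters are bound with DataPathComputeBinding(mode="mount"), score.py receives them as plain filesystem paths. The actual score.py is not shown above; a hedged sketch of how it might read those arguments:

# score.py (sketch): the mounted DataPath parameters arrive as ordinary paths
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument("--build_id", type=str)
parser.add_argument("--input_datapath", type=str)   # mount point of the input datastore path
parser.add_argument("--output_datapath", type=str)  # mount point of the output datastore path
args = parser.parse_args()

os.makedirs(args.output_datapath, exist_ok=True)
for file_name in os.listdir(args.input_datapath):
    print("Scoring", os.path.join(args.input_datapath, file_name))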
Example No. 24
def create_experiment_config(workspace):
    ########################################
    ### Creating data prep Pipeline Step ###
    ########################################

    # Load settings
    print("Loading settings")
    data_prep_step_path = os.path.join("steps", "data_prep")
    with open(os.path.join(data_prep_step_path, "step.json")) as f:
        data_prep_settings = json.load(f)

    # Setup datasets - Create PipelineParameter for dynamic pipeline input
    print("Setting up datasets with dynamic input")
    data_prep_input_path = DataPath(
        datastore=Datastore(workspace=workspace,
                            name=data_prep_settings.get(
                                "datastore_input_name", "workspaceblobstore")),
        path_on_datastore="golden/Atlantis/PAX1/15-Mar-2020-23-37-50-279971/PAX1.parquet/")
    data_prep_input_path_pipeline_parameter = PipelineParameter(
        name="input_path", default_value=data_prep_input_path)
    data_prep_input = (data_prep_input_path_pipeline_parameter,
                       DataPathComputeBinding(mode="mount"))
    data_prep_output = PipelineData(
        name=data_prep_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=data_prep_settings.get(
                                "datastore_output_name",
                                "workspaceblobstore")),
        output_mode="mount").as_dataset()
    # Uncomment next lines, if you want to register intermediate dataset
    #data_prep_output.register(
    #    name=data_prep_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    data_prep_dependencies = CondaDependencies.create(
        pip_packages=data_prep_settings.get("pip_packages", []),
        conda_packages=data_prep_settings.get("conda_packages", []),
        python_version=data_prep_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    data_prep_run_config = RunConfiguration(
        conda_dependencies=data_prep_dependencies,
        framework=data_prep_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    data_prep_compute_target = ComputeTarget(workspace=workspace,
                                             name=data_prep_settings.get(
                                                 "compute_target_name", None))

    # Create python step
    print("Creating Step")
    data_prep = PythonScriptStep(
        name=data_prep_settings.get("step_name", None),
        script_name=data_prep_settings.get("script_name", None),
        arguments=data_prep_settings.get("arguments", []) +
        ["--input-datapath", data_prep_input],
        compute_target=data_prep_compute_target,
        runconfig=data_prep_run_config,
        inputs=[data_prep_input],
        outputs=[data_prep_output],
        params=data_prep_settings.get("parameters", []),
        source_directory=data_prep_step_path,
        allow_reuse=data_prep_settings.get("allow_reuse", True),
        version=data_prep_settings.get("version", None),
    )

    ############################################
    ### Creating inference Parallel Run Step ###
    ############################################

    # Load settings
    print("Loading settings")
    batch_inference_step_path = os.path.join("steps", "batch_inference")
    with open(os.path.join(batch_inference_step_path, "step.json")) as f:
        batch_inference_settings = json.load(f)

    # Setup datasets of first step
    print("Setting up datasets")
    batch_inference_input = data_prep_output.as_named_input(
        name=batch_inference_settings.get("dataset_input_name", None))
    batch_inference_output = PipelineData(
        name=batch_inference_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=batch_inference_settings.get(
                                "datastore_output_name", None)),
        output_mode="mount",
    ).as_dataset()
    # Uncomment next lines, if you want to register intermediate dataset
    #batch_inference_output.register(
    #    name=batch_inference_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    batch_inference_dependencies = CondaDependencies.create(
        pip_packages=batch_inference_settings.get("pip_packages", []),
        conda_packages=batch_inference_settings.get("conda_packages", []),
        python_version=batch_inference_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    batch_inference_run_config = RunConfiguration(
        conda_dependencies=batch_inference_dependencies,
        framework=batch_inference_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    batch_inference_compute_target = ComputeTarget(
        workspace=workspace,
        name=batch_inference_settings.get("compute_target_name", None))

    # Create python step
    print("Creating Step")
    batch_inference = PythonScriptStep(
        name=batch_inference_settings.get("step_name", None),
        script_name=batch_inference_settings.get("script_name", None),
        arguments=batch_inference_settings.get("arguments", []),
        compute_target=batch_inference_compute_target,
        runconfig=batch_inference_run_config,
        inputs=[batch_inference_input],
        outputs=[batch_inference_output],
        params=batch_inference_settings.get("parameters", []),
        source_directory=batch_inference_step_path,
        allow_reuse=batch_inference_settings.get("allow_reuse", True),
        version=batch_inference_settings.get("version", None),
    )

    #########################
    ### Creating Pipeline ###
    #########################

    # Create Pipeline
    print("Creating Pipeline")
    pipeline = Pipeline(
        workspace=workspace,
        steps=[batch_inference],
        description="Batch Inference Pipeline",
    )

    return pipeline
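A sketch of submitting the assembled pipeline, overriding the dynamic input_path parameter with a different DataPath at submission time; the experiment name and the alternative path are placeholders:

from azureml.core import Experiment, Workspace
from azureml.data.datapath import DataPath

ws = Workspace.from_config()
pipeline = create_experiment_config(ws)

# Override the "input_path" PipelineParameter defined in the data prep step
run = Experiment(ws, "batch-inference").submit(
    pipeline,
    pipeline_parameters={
        "input_path": DataPath(
            datastore=ws.datastores["workspaceblobstore"],
            path_on_datastore="golden/Atlantis/PAX1/<another-run>/PAX1.parquet/",
        )
    },
)
run.wait_for_completion(show_output=True)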