Example #1
def create_register_model_step(model_folder: PipelineData,
                               register_model_folder: PipelineData,
                               compute: ComputeTarget,
                               debug_run: bool) -> PythonScriptStep:
    """
    Creates "Register Model" PythonScriptStep
    """
    force_param = PipelineParameter(name="force_registration",
                                    default_value="False")
    skip_param = PipelineParameter(name="skip_registration",
                                   default_value="False")

    register_step = PythonScriptStep(
        name="Register Model",
        script_name="register_model.py",
        source_directory='./safe-driver/register/',
        compute_target=compute,
        inputs=[model_folder, register_model_folder],
        arguments=[
            "--force", force_param, "--skip", skip_param, "--model-metadata",
            model_folder.as_mount(), "--register-model-folder",
            register_model_folder.as_mount()
        ],
        allow_reuse=debug_run,
        runconfig=RC)

    return register_step
Example #2
def create_train_model_step(
        input_data: PipelineData, compute: ComputeTarget,
        debug_run: bool) -> Tuple[PythonScriptStep, PipelineData]:
    output_folder = "./outputs"
    output_data = PipelineData(name="ModelMetadata",
                               datastore=WS.get_default_datastore(),
                               is_directory=True,
                               output_path_on_compute=output_folder,
                               output_mode="upload")

    train_step = PythonScriptStep(name="Train Model",
                                  script_name="train.py",
                                  source_directory='./safe-driver/train/',
                                  compute_target=compute,
                                  inputs=[input_data],
                                  outputs=[output_data],
                                  allow_reuse=debug_run,
                                  arguments=[
                                      "--output-folder", output_folder,
                                      "--training-data",
                                      input_data.as_mount()
                                  ],
                                  runconfig=RC)

    return train_step, output_data
Example #3
def create_evaluate_model_step(
        model_metadata_folder: PipelineData, compute: ComputeTarget,
        validation_data: Dataset,
        debug_run: bool) -> Tuple[PythonScriptStep, PipelineData]:
    """
    Creates "Evaluate Model" Step
    """
    output_folder = "./outputs"
    output_data = PipelineData(name="RegisterModel",
                               datastore=WS.get_default_datastore(),
                               is_directory=True,
                               output_path_on_compute=output_folder,
                               output_mode="upload")

    eval_step = PythonScriptStep(
        name="Evaluate Model",
        script_name="evaluate.py",
        source_directory='./safe-driver/evaluate/',
        compute_target=compute,
        inputs=[model_metadata_folder],
        outputs=[output_data],
        arguments=[
            "--model-metadata",
            model_metadata_folder.as_mount(), "--register-model-folder",
            output_folder, "--validation-data",
            validation_data.as_named_input("ValidationData").as_mount()
        ],
        allow_reuse=debug_run,
        runconfig=RC)

    return eval_step, output_data
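For orientation, a minimal assembly sketch (not part of the original source) showing how the three safe-driver steps above are typically wired into one pipeline. WS and RC are assumed to be the module-level Workspace and RunConfiguration these helpers rely on; the compute name, dataset name, experiment name, and prep_output (the PipelineData produced by an upstream data-preparation step such as Example #12) are placeholders.

from azureml.core import Dataset, Experiment
from azureml.core.compute import ComputeTarget
from azureml.pipeline.core import Pipeline

compute = ComputeTarget(workspace=WS, name="cpu-cluster")          # placeholder compute name
validation_data = Dataset.get_by_name(WS, name="ValidationData")   # placeholder dataset name

# prep_output is assumed to be the PipelineData output of an upstream prep step.
train_step, model_folder = create_train_model_step(prep_output, compute, debug_run=False)
eval_step, register_folder = create_evaluate_model_step(
    model_folder, compute, validation_data, debug_run=False)
register_step = create_register_model_step(
    model_folder, register_folder, compute, debug_run=False)

pipeline = Pipeline(workspace=WS, steps=[train_step, eval_step, register_step])
run = Experiment(WS, "safe-driver-training").submit(
    pipeline,
    # Override the PipelineParameters declared in create_register_model_step.
    pipeline_parameters={"force_registration": "True", "skip_registration": "False"})
run.wait_for_completion(show_output=True)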
def get_output_location(
    ws: Workspace, env: Env, outputdatastore: Datastore = None
) -> PipelineData:
    """
    Returns a Datastore wrapped as a PipelineData instance suitable
    for passing into a pipeline step. Represents the location where
    the scoring output should be written. Uses the default workspace
    blob store if no output datastore is supplied.


    :param ws: AML Workspace
    :param env: Environment Variables
    :param outputdatastore: AML Datastore, optional, default is None

    :returns: PipelineData wrapping the output datastore
    """

    if outputdatastore is None:
        output_loc = PipelineData(
            name="defaultoutput", datastore=ws.get_default_datastore()
        )
    else:
        output_loc = PipelineData(
            name=outputdatastore.name, datastore=outputdatastore
        )  # NOQA: E501

    return output_loc
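A hedged usage sketch for get_output_location: the returned PipelineData is handed to the batch-scoring step as its output. The script name, source directory, compute target, and run configuration below are placeholders, not part of the original project.

from azureml.pipeline.steps import PythonScriptStep

output_loc = get_output_location(ws, env)
score_step = PythonScriptStep(
    name="Batch Score",
    script_name="score.py",                   # placeholder entry script
    source_directory="./scoring",             # placeholder source directory
    arguments=["--output_dir", output_loc],
    outputs=[output_loc],
    compute_target=compute,                   # placeholder ComputeTarget
    runconfig=run_config)                     # placeholder RunConfiguration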
Example #5
def get_output_location(workspace, output_datastore=None):
    if output_datastore is not None:
        output_loc = PipelineData(name=output_datastore.name,
                                  datastore=output_datastore)
    else:
        output_loc = PipelineData(name='output_loc',
                                  datastore=workspace.get_default_datastore())

    return output_loc
def data_ingestion_step(datastore_reference, compute_target):
    run_config = RunConfiguration()
    run_config.environment.docker.enabled = True

    raw_data_dir = PipelineData(
        name='raw_data_dir', 
        pipeline_output_name='raw_data_dir',
        datastore=datastore_reference.datastore,
        output_mode='mount',
        is_directory=True)

    outputs = [raw_data_dir]
    outputs_map = { 'raw_data_dir': raw_data_dir }

    step = PythonScriptStep(
        script_name='data_ingestion.py',
        arguments=['--output_dir', raw_data_dir, ],
        inputs=[datastore_reference],
        outputs=outputs,
        compute_target=compute_target,
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        runconfig=run_config,
        allow_reuse=True
    )

    return step, outputs_map
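A hedged sketch of why pipeline_output_name is set on raw_data_dir above: once the pipeline run completes, the named output can be fetched straight from the PipelineRun. The workspace, datastore reference, and experiment name are placeholders.

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

step, outputs_map = data_ingestion_step(datastore_reference, compute_target)
pipeline = Pipeline(workspace=ws, steps=[step])
pipeline_run = Experiment(ws, "data-ingestion").submit(pipeline)
pipeline_run.wait_for_completion(show_output=True)

# Fetch the step output by the pipeline_output_name declared on the PipelineData.
port = pipeline_run.get_pipeline_output("raw_data_dir")
port.download(local_path="./outputs")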
def train_step(train_dir, compute_target):

    max_depth = PipelineParameter(name='max_depth', default_value=5)
    n_estimators = PipelineParameter(name='n_estimators', default_value=500)

    model_dir = PipelineData(name='model_dir',
                             pipeline_output_name='model_dir',
                             datastore=train_dir.datastore,
                             output_mode='mount',
                             is_directory=True)

    outputs = [model_dir]
    outputs_map = {'model_dir': model_dir}

    estimator = SKLearn(source_directory=os.path.dirname(
        os.path.abspath(__file__)),
                        entry_script='train.py',
                        compute_target=compute_target)

    step = EstimatorStep(estimator=estimator,
                         estimator_entry_script_arguments=[
                             '--train_dir', train_dir, '--output_dir',
                             model_dir, '--max_depth', max_depth,
                             '--n_estimators', n_estimators
                         ],
                         inputs=[train_dir],
                         compute_target=compute_target,
                         outputs=outputs,
                         allow_reuse=False)

    return step, outputs_map
def deploy_step(model_dir, accuracy_file, test_dir, compute_target):   

    scoring_url = PipelineData(
        name='scoring_url', 
        pipeline_output_name='scoring_url',
        datastore=accuracy_file.datastore,
        output_mode='mount',
        is_directory=False)

    outputs = [scoring_url]
    outputs_map = { 'scoring_url': scoring_url }

    step = PythonScriptStep(
        script_name='deploy.py',
        arguments=[
            '--model_dir', model_dir, 
            '--accuracy_file', accuracy_file, 
            '--test_dir', test_dir, 
            '--scoring_url', scoring_url
        ],
        inputs=[model_dir, accuracy_file, test_dir],
        outputs=outputs,
        compute_target=compute_target,
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        allow_reuse=False
    )

    return step, outputs_map
def evaluate_step(model_dir, test_dir, compute_target):

    accuracy_file = PipelineData(name='accuracy_file',
                                 pipeline_output_name='accuracy_file',
                                 datastore=test_dir.datastore,
                                 output_mode='mount',
                                 is_directory=False)

    outputs = [accuracy_file]
    outputs_map = {'accuracy_file': accuracy_file}

    estimator = SKLearn(source_directory=os.path.dirname(
        os.path.abspath(__file__)),
                        entry_script='evaluate.py',
                        compute_target=compute_target)

    step = EstimatorStep(estimator=estimator,
                         estimator_entry_script_arguments=[
                             '--test_dir', test_dir, '--model_dir', model_dir,
                             '--accuracy_file', accuracy_file
                         ],
                         inputs=[model_dir, test_dir],
                         outputs=outputs,
                         compute_target=compute_target,
                         allow_reuse=True)

    return step, outputs_map
    def _setup_datastore(self, blob_dataset_name, output_path=None):
        """
        Sets up the datastore in Azure ML: either retrieves a pre-existing datastore
        or registers a new one in the workspace.

        :param str blob_dataset_name: [required] name of the datastore registered with the
                                 workspace. If the datastore does not yet exist, the
                                 name it will be registered under.
        :param str output_path: [optional] if registering a datastore for inferencing,
                                the output path for writing back predictions.
        """
        try:
            self.blob_ds = Datastore.get(self.ws, blob_dataset_name)
            print("Found Blob Datastore with name: %s" % blob_dataset_name)
        except HttpOperationError:
            self.blob_ds = Datastore.register_azure_blob_container(
                workspace=self.ws,
                datastore_name=blob_dataset_name,
                account_name=self.account_name,
                container_name=self.container_name,
                account_key=self.account_key,
                subscription_id=self.blob_sub_id,
            )

            print("Registered blob datastore with name: %s" %
                  blob_dataset_name)
        if output_path is not None:
            self.output_dir = PipelineData(
                name="output",
                datastore=self.ws.get_default_datastore(),
                output_path_on_compute=output_path)
Example #11
def process_step(datastore: Datastore, compute: ComputeTarget,
                 path_on_datastore: str) -> (PipelineData, EstimatorStep):
    datapath = DataPath(datastore=datastore,
                        path_on_datastore=path_on_datastore)
    data_path_pipeline_param = (PipelineParameter(name="data",
                                                  default_value=datapath),
                                DataPathComputeBinding(mode='mount'))

    seer_tfrecords = PipelineData("tfrecords_set",
                                  datastore=datastore,
                                  is_directory=True)

    prep = Estimator(source_directory='.',
                     compute_target=compute,
                     entry_script='prep.py',
                     pip_requirements_file='requirements.txt')

    prepStep = EstimatorStep(name='Data Preparation',
                             estimator=prep,
                             estimator_entry_script_arguments=[
                                 "--source_path", data_path_pipeline_param,
                                 "--target_path", seer_tfrecords
                             ],
                             inputs=[data_path_pipeline_param],
                             outputs=[seer_tfrecords],
                             compute_target=compute)

    return seer_tfrecords, prepStep
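A hedged sketch showing the point of the DataPath PipelineParameter named "data" above: it can be re-pointed at a different folder when the pipeline is submitted. Workspace, datastore, paths, and experiment name are placeholders.

from azureml.core import Experiment
from azureml.data.datapath import DataPath
from azureml.pipeline.core import Pipeline

tfrecords, prep_step = process_step(datastore, compute, "seer/images")
pipeline = Pipeline(workspace=ws, steps=[prep_step])
run = Experiment(ws, "seer-prep").submit(
    pipeline,
    # Re-point the "data" parameter at another folder on the same datastore.
    pipeline_parameters={"data": DataPath(datastore=datastore,
                                          path_on_datastore="seer/images_v2")})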
Example #12
def create_databricks_step(
        input_dataset: Dataset, compute: ComputeTarget,
        debug_run: bool) -> Tuple[DatabricksStep, PipelineData]:
    output_data = PipelineData(name="ParquetFiles",
                               datastore=WS.get_default_datastore(),
                               is_directory=True)

    node_size = 'Standard_DS4_v2'
    spark_version = '7.3.x-cpu-ml-scala2.12'

    db_step = DatabricksStep(
        name='Convert to Parquet',
        inputs=[input_dataset.as_named_input("CSVFiles")],
        outputs=[output_data],
        source_directory="./safe-driver/prep_data",
        python_script_name='prep_data.py',
        python_script_params=["--number-of-files",
                              "1"],  # Set the number of output files to 1
        num_workers=1,
        compute_target=compute,
        pypi_libraries=[],
        allow_reuse=debug_run,
        node_type=node_size,
        spark_version=spark_version,
    )

    return db_step, output_data
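A hedged chaining sketch: the ParquetFiles PipelineData produced by the Databricks step can be consumed directly by the training step from Example #2, even though the two steps run on different compute. The dataset and compute names are placeholders.

from azureml.core import Dataset
from azureml.core.compute import ComputeTarget
from azureml.pipeline.core import Pipeline

csv_dataset = Dataset.get_by_name(WS, name="RawCSVFiles")              # placeholder dataset
databricks_compute = ComputeTarget(workspace=WS, name="databricks")    # attached Databricks compute (placeholder)
aml_compute = ComputeTarget(workspace=WS, name="cpu-cluster")          # placeholder AML compute

db_step, parquet_files = create_databricks_step(csv_dataset, databricks_compute,
                                                debug_run=False)
train_step, model_metadata = create_train_model_step(parquet_files, aml_compute,
                                                      debug_run=False)
pipeline = Pipeline(workspace=WS, steps=[db_step, train_step])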
    def prepare(self):
        def _regular_name(port_name):
            # AML Service does not allow names with spaces; replace them with underscores.
            return '_'.join(port_name.split())

        if not self.prepared:
            conn = PipelineData(_regular_name(self.name))
            self.value = conn
Example #14
    def _get_data_references(self, request_id, internal_datastore):
        print(
            'AMLCompute, _get_data_references() called. Request ID: {}'.format(
                request_id))
        # The datastore name may only contain alphanumeric characters and underscores.
        request_id_to_use_for_datastore = request_id.replace('-', '_')
        try:
            # setting the overwrite flag to True overwrites any datastore that was created previously with that name

            # internal_datastore stores all user-facing files: list of images, detection results, list of failed images
            # and it so happens that each job also needs the list of images as an input
            internal_datastore_name = 'internal_datastore_{}'.format(
                request_id_to_use_for_datastore)
            internal_account_name = internal_datastore['account_name']
            internal_account_key = internal_datastore['account_key']
            internal_container_name = internal_datastore['container_name']
            internal_datastore = Datastore.register_azure_blob_container(
                self.ws,
                internal_datastore_name,
                internal_container_name,
                internal_account_name,
                account_key=internal_account_key)
            print('internal_datastore done')

            # output_datastore stores the output from score.py in each job, which is another container
            # in the same storage account as internal_datastore
            output_datastore_name = 'output_datastore_{}'.format(
                request_id_to_use_for_datastore)
            output_container_name = api_config.AML_CONTAINER
            output_datastore = Datastore.register_azure_blob_container(
                self.ws,
                output_datastore_name,
                output_container_name,
                internal_account_name,
                account_key=internal_account_key)
            print('output_datastore done')

        except Exception as e:
            raise RuntimeError(
                'Error in connecting to the datastores for AML Compute: {}'.
                format(str(e)))

        try:
            internal_dir = DataReference(datastore=internal_datastore,
                                         data_reference_name='internal_dir',
                                         mode='mount')

            output_dir = PipelineData(
                'output_{}'.format(request_id_to_use_for_datastore),
                datastore=output_datastore,
                output_mode='mount')
            print('Finished setting up the Data References.')
        except Exception as e:
            raise RuntimeError(
                'Error in creating data references for AML Compute: {}.'.
                format(str(e)))

        return internal_dir, output_dir
Example #15
def train_step(train_dir, valid_dir, compute_target):
    '''
    This step will fine-tune a ResNet-18 model on our dataset using PyTorch.
    It will use the corresponding input image directories as training and validation data.

    :param train_dir: The reference to the directory containing the training data
    :type train_dir: DataReference
    :param valid_dir: The reference to the directory containing the validation data
    :type valid_dir: DataReference
    :param compute_target: The compute target to run the step on
    :type compute_target: ComputeTarget
    
    :return: The training step, step outputs dictionary (keys: model_dir)
    :rtype: EstimatorStep, dict
    '''

    num_epochs = PipelineParameter(name='num_epochs', default_value=25)
    batch_size = PipelineParameter(name='batch_size', default_value=16)
    learning_rate = PipelineParameter(name='learning_rate', default_value=0.001)
    momentum = PipelineParameter(name='momentum', default_value=0.9)

    model_dir = PipelineData(
        name='model_dir', 
        pipeline_output_name='model_dir',
        datastore=train_dir.datastore,
        output_mode='mount',
        is_directory=True)

    outputs = [model_dir]
    outputs_map = { 'model_dir': model_dir }

    estimator = PyTorch(
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        entry_script='train.py',
        framework_version='1.3',
        compute_target=compute_target,
        use_gpu=True)

    step = EstimatorStep(
        name="Train Model",
        estimator=estimator,
        estimator_entry_script_arguments=[
            '--train_dir', train_dir, 
            '--valid_dir', valid_dir, 
            '--output_dir', model_dir, 
            '--num_epochs', num_epochs, 
            '--batch_size', batch_size,
            '--learning_rate', learning_rate, 
            '--momentum', momentum
        ],
        inputs=[train_dir, valid_dir],
        compute_target=compute_target,
        outputs=outputs,
        allow_reuse=False)

    return step, outputs_map
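A hedged sketch of overriding the hyperparameter PipelineParameters declared above at submission time; they keep their defaults otherwise. Workspace, data references, compute, and experiment name are placeholders.

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

step, outputs = train_step(train_dir, valid_dir, compute_target)
pipeline = Pipeline(workspace=ws, steps=[step])
run = Experiment(ws, "image-classification").submit(
    pipeline,
    pipeline_parameters={"num_epochs": 40, "batch_size": 32,
                         "learning_rate": 0.0005, "momentum": 0.9})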
def data_preprocess_step(raw_data_dir, compute_target):

    run_config = RunConfiguration()
    run_config.environment.python.conda_dependencies = CondaDependencies.create(
        pip_packages=['pandas'])
    run_config.environment.docker.enabled = True

    train_dir = PipelineData(name='train_dir',
                             pipeline_output_name='train_dir',
                             datastore=raw_data_dir.datastore,
                             output_mode='mount',
                             is_directory=True)

    test_dir = PipelineData(name='test_dir',
                            pipeline_output_name='test_dir',
                            datastore=raw_data_dir.datastore,
                            output_mode='mount',
                            is_directory=True)

    outputs = [train_dir, test_dir]
    outputs_map = {
        'train_dir': train_dir,
        'test_dir': test_dir,
    }

    step = PythonScriptStep(script_name='data_preprocess.py',
                            arguments=[
                                '--raw_data_dir',
                                raw_data_dir,
                                '--train_dir',
                                train_dir,
                                '--test_dir',
                                test_dir,
                            ],
                            inputs=[raw_data_dir],
                            outputs=outputs,
                            compute_target=compute_target,
                            runconfig=run_config,
                            source_directory=os.path.dirname(
                                os.path.abspath(__file__)),
                            allow_reuse=True)

    return step, outputs_map
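A hedged end-to-end sketch of the chaining convention these helpers share: every step returns (step, outputs_map), and the mapped PipelineData objects become the inputs of the next step. It uses data_ingestion_step, data_preprocess_step, train_step, evaluate_step, and deploy_step shown earlier; the workspace, datastore reference, compute target, and experiment name are placeholders.

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

ingest, ingest_out = data_ingestion_step(datastore_reference, compute_target)
prep, prep_out = data_preprocess_step(ingest_out['raw_data_dir'], compute_target)
train, train_out = train_step(prep_out['train_dir'], compute_target)
evaluate, eval_out = evaluate_step(train_out['model_dir'], prep_out['test_dir'],
                                   compute_target)
deploy, deploy_out = deploy_step(train_out['model_dir'], eval_out['accuracy_file'],
                                 prep_out['test_dir'], compute_target)

pipeline = Pipeline(workspace=ws, steps=[ingest, prep, train, evaluate, deploy])
run = Experiment(ws, "sklearn-pipeline").submit(pipeline)
run.wait_for_completion(show_output=True)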
Example #17
def get_test_data(aml_interface):
    datastore = aml_interface.workspace.get_default_datastore()
    datastore_paths = [(datastore, PREDICTION_FILE)]
    dataset = Dataset.Tabular.from_delimited_files(path=datastore_paths)

    registered_iris_ds = dataset.register(workspace=aml_interface.workspace,
                                          name=PREDICTION_DATASET_NAME,
                                          create_new_version=True)
    named_iris_ds = registered_iris_ds.as_named_input(PREDICTION_DATASET_NAME)

    output_folder = PipelineData(name=PARALLEL_TASK_NAME, datastore=datastore)
    return named_iris_ds, output_folder
    def prepare(self):
        """
        Prepare a Port instance to be connected by assigning a PipelineData instance to its value

        """
        def _regular_name(port_name):
            # AML Service does not allow names with spaces; replace them with underscores.
            return '_'.join(port_name.split())

        if not self.prepared:
            conn = PipelineData(_regular_name(self.name))
            self.value = conn
Example #19
    def setup_training_step(self):
        prepped_data = self.prepped_data_path.parse_parquet_files(
            file_extension=None)
        project_folder = './automl'

        automl_config = AutoMLConfig(compute_target=self.aml_compute,
                                     task="classification",
                                     training_data=prepped_data,
                                     label_column_name="test_result",
                                     path=project_folder,
                                     enable_early_stopping=True,
                                     featurization='auto',
                                     debug_log="automl_errors.log",
                                     n_cross_validations=10,
                                     **automl_settings)

        ds = self.ws.get_default_datastore()
        metrics_output_name = 'metrics_output'
        best_model_output_name = 'model_output'

        metrics_data = PipelineData(
            name='metrics_data',
            datastore=ds,
            pipeline_output_name=metrics_output_name,
            training_output=TrainingOutput(type='Metrics'))
        model_data = PipelineData(name='best_model_data',
                                  datastore=ds,
                                  pipeline_output_name=best_model_output_name,
                                  training_output=TrainingOutput(type='Model'))

        self.model_data = model_data

        automl_step = AutoMLStep(name='automl_module',
                                 automl_config=automl_config,
                                 passthru_automl_config=False,
                                 outputs=[metrics_data, model_data],
                                 allow_reuse=True)

        return automl_step
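A hedged sketch of consuming the two training outputs declared above: after the run completes, each PipelineData can be pulled down by its pipeline_output_name. The workspace and experiment name are placeholders.

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

pipeline_run = Experiment(ws, "automl-pipeline").submit(
    Pipeline(workspace=ws, steps=[automl_step]))
pipeline_run.wait_for_completion(show_output=True)

# Download the metrics and the best model by their pipeline_output_name.
pipeline_run.get_pipeline_output('metrics_output').download(local_path='.')
pipeline_run.get_pipeline_output('model_output').download(local_path='.')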
Example #20
def run_script(ws, datastore, pipeline_name, instructions):
    pipeline_steps = []
    last_output = []

    for i in range(len(instructions)):
        compute_target = get_or_create_compute_target(
            ws,
            compute_name=instructions[i]['name'],
            vm_size=instructions[i].get('vm_size', 'STANDARD_D2_V2'),
            min_nodes=instructions[i].get('min_nodes', 0),
            max_nodes=instructions[i].get('max_nodes', 4),
            idle_sec=instructions[i].get('idle_seconds_before_scale_down',
                                         120))
        run_config = create_runconfig(compute_target)

        # input directory in datastore
        if len(last_output) == 0:
            input_dir = None
            # input_dir = DataReference(
            #     datastore=datastore,
            #     data_reference_name=DATA_REFERENCE_NAME + str(i),
            #     path_on_datastore="flows/",
            #     mode='download'
            # )
        else:
            input_dir = last_output

        # output directory in datastore
        output_dir = PipelineData(
            name=DATA_REFERENCE_NAME + str(i),
            datastore=datastore,
            output_path_on_compute=OUTPUT_PATH_ON_COMPUTE)

        # create pipeline step
        pipeline_step = create_mlapp_pipeline_step(
            compute_target,
            run_config,
            source_directory=os.getcwd(),
            entry_script=os.path.join("deployment", "aml_flow.py"),
            input_dir=input_dir,
            output_dir=output_dir,
            param_name='config' + str(i))

        # add to pipeline
        pipeline_steps += pipeline_step

        # reference last output
        last_output.append(output_dir)

    publish_pipeline_endpoint(ws, pipeline_steps, pipeline_name)
    def _setup_pipelinedata(self, name, output_path=None):
        """
        Helper function to set up a PipelineData object in Azure ML.

        :param str name: [required] name of the data object in AzureML
        :param str output_path: path on output datastore to write data to
        :returns: output_data
        :rtype: PipelineData
        """
        if output_path is not None:
            output_data = PipelineData(
                name=name,
                datastore=self.blob_ds,
                output_name=name,
                output_mode="mount",
                output_path_on_compute=output_path,
                is_directory=True,
            )
        else:
            output_data = PipelineData(
                name=name,
                datastore=self.ws.get_default_datastore(),
                output_name=name)
        return output_data
def data_ingestion_step(datastore, compute_target):
    '''
    This step will leverage Azure Cognitive Services to search the web for images 
    to create a dataset. This replicates the real-world scenario of data being 
    ingested from a constantly changing source. The same 10 classes in the CIFAR-10 dataset 
    will be used (airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck). 

    :param datastore: The datastore that will be used
    :type datastore: Datastore
    :param compute_target: The compute target to run the step on
    :type compute_target: ComputeTarget
    
    :return: The ingestion step, step outputs dictionary (keys: raw_data_dir)
    :rtype: PythonScriptStep, dict
    '''

    run_config = RunConfiguration()
    run_config.environment.environment_variables = {
        'BING_SEARCH_V7_SUBSCRIPTION_KEY': os.environ['BING_SEARCH_V7_SUBSCRIPTION_KEY'],
        'BING_SEARCH_V7_ENDPOINT': os.environ['BING_SEARCH_V7_ENDPOINT'],
        'AZURE_REGION': datastore._workspace.location
        }
    run_config.environment.docker.enabled = True

    num_images = PipelineParameter(name='num_images', default_value=25)

    raw_data_dir = PipelineData(
        name='raw_data_dir', 
        pipeline_output_name='raw_data_dir',
        datastore=datastore,
        output_mode='mount',
        is_directory=True)

    outputs = [raw_data_dir]
    outputs_map = { 'raw_data_dir': raw_data_dir }

    step = PythonScriptStep(
        name="Data Ingestion",
        script_name='data_ingestion.py',
        arguments=['--output_dir', raw_data_dir, '--num_images', num_images],
        outputs=outputs,
        compute_target=compute_target,
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        runconfig=run_config,
        allow_reuse=True
    )

    return step, outputs_map
Example #23
def evaluate_step(model_dir, test_dir, compute_target):
    '''
    This step evaluates the trained model on the testing data and outputs the accuracy.

    :param model_dir: The reference to the directory containing the trained model
    :type model_dir: DataReference
    :param test_dir: The reference to the directory containing the testing data
    :type test_dir: DataReference
    :param compute_target: The compute target to run the step on
    :type compute_target: ComputeTarget
    
    :return: The evaluate step, step outputs dictionary (keys: accuracy_file)
    :rtype: EstimatorStep, dict
    '''

    accuracy_file = PipelineData(
        name='accuracy_file', 
        pipeline_output_name='accuracy_file',
        datastore=test_dir.datastore,
        output_mode='mount',
        is_directory=False)

    outputs = [accuracy_file]
    outputs_map = { 'accuracy_file': accuracy_file }
    
    estimator = PyTorch(
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        entry_script='evaluate.py',
        framework_version='1.3',
        compute_target=compute_target,
        use_gpu=True)

    step = EstimatorStep(
        name="Evaluate Model",
        estimator=estimator,
        estimator_entry_script_arguments=[
            '--test_dir', test_dir, 
            '--model_dir', model_dir, 
            '--accuracy_file', accuracy_file
        ],
        inputs=[model_dir, test_dir],
        outputs=outputs,
        compute_target=compute_target,
        allow_reuse=True)

    return step, outputs_map
Example #24
    def get_pipeline_data(self, config):
        pipeline_data = []

        for c in config:            
            if c["type"] == StepArgParser.ARG_TYPE_PIPELINE_DATA:
                pconfig = c["config"]
                pname = pconfig["name"]
                pds = pconfig.get("datastore") or "default"

                if pds == "default":
                    use_ds = self.workspace.get_default_datastore()
                else:
                    use_ds = Datastore.get(workspace=self.workspace, datastore_name=pds)

                pd = PipelineData(pname, datastore=use_ds)

                pipeline_data.append(pd)        

        return pipeline_data
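A hedged illustration of the configuration shape this helper appears to expect; the field names mirror the keys read above, while the concrete values (and the step_builder instance) are invented.

example_config = [
    {
        "type": StepArgParser.ARG_TYPE_PIPELINE_DATA,
        "config": {"name": "processed_data", "datastore": "default"},
    },
]
pipeline_data = step_builder.get_pipeline_data(example_config)  # hypothetical instance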
Example #25
def estimator(data, store, compute):
    estimator = Estimator(source_directory=os.path.dirname(
        os.path.abspath(__file__)),
                          compute_target=compute,
                          entry_script='train.py',
                          pip_packages=['azureml-dataprep', 'lightgbm'])

    output = PipelineData("output", datastore=store)

    step = EstimatorStep(name=os.path.basename(__file__),
                         estimator=estimator,
                         estimator_entry_script_arguments=[
                             '--input_dir', data, '--output_dir', output
                         ],
                         inputs=[data],
                         outputs=[output],
                         compute_target=estimator._compute_target,
                         allow_reuse=True)

    return step, output
Example #26
def deploy_step(model_dir, accuracy_file, test_dir, compute_target):
    '''
    This step registers and deploys a new model on its first run. In subsequent runs, it only registers
    and deploys a new model if the training dataset has changed, or if the dataset is unchanged but the accuracy has improved.

    :param model_dir: The reference to the directory containing the trained model
    :type model_dir: DataReference
    :param accuracy_file: The reference to the file containing the evaluation accuracy
    :type accuracy_file: DataReference
    :param test_dir: The reference to the directory containing the testing data
    :type test_dir: DataReference
    :param compute_target: The compute target to run the step on
    :type compute_target: ComputeTarget
    
    :return: The deploy step, step outputs dictionary (keys: scoring_url)
    :rtype: PythonScriptStep, dict
    '''

    scoring_url = PipelineData(name='scoring_url',
                               pipeline_output_name='scoring_url',
                               datastore=accuracy_file.datastore,
                               output_mode='mount',
                               is_directory=False)

    outputs = [scoring_url]
    outputs_map = {'scoring_url': scoring_url}

    step = PythonScriptStep(
        name="Deploy Model",
        script_name='deploy.py',
        arguments=[
            '--model_dir', model_dir, '--accuracy_file', accuracy_file,
            '--test_dir', test_dir, '--scoring_url', scoring_url
        ],
        inputs=[model_dir, accuracy_file, test_dir],
        outputs=outputs,
        compute_target=compute_target,
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        allow_reuse=False)

    return step, outputs_map
Example #27
def register_step(datastore: Datastore, input_data: PipelineData,
                  compute: ComputeTarget,
                  build: str) -> (PipelineData, EstimatorStep):
    seer_model = PipelineData("model", datastore=datastore, is_directory=True)

    register = Estimator(source_directory='.',
                         compute_target=compute,
                         entry_script='register.py')

    registerStep = EstimatorStep(name='Model Registration',
                                 estimator=register,
                                 estimator_entry_script_arguments=[
                                     "--source_path", input_data,
                                     "--target_path", seer_model, "--build",
                                     build
                                 ],
                                 inputs=[input_data],
                                 outputs=[seer_model],
                                 compute_target=compute)

    return seer_model, registerStep
Example #28
def train_step(datastore: Datastore, input_data: PipelineData,
               compute: ComputeTarget) -> (PipelineData, EstimatorStep):
    seer_training = PipelineData("train",
                                 datastore=datastore,
                                 is_directory=True)

    train = Estimator(source_directory='.',
                      compute_target=compute,
                      entry_script='train.py',
                      use_gpu=True,
                      pip_requirements_file='requirements.txt')

    trainStep = EstimatorStep(name='Model Training',
                              estimator=train,
                              estimator_entry_script_arguments=[
                                  "--source_path", input_data, "--target_path",
                                  seer_training, "--epochs", 15, "--batch", 10,
                                  "--lr", 0.001
                              ],
                              inputs=[input_data],
                              outputs=[seer_training],
                              compute_target=compute)

    return seer_training, trainStep
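A hedged assembly sketch for the seer helpers (Examples #11, #27, #28): each returns its output PipelineData first, and that output feeds the next step. The datastore path, compute, build id, workspace, and experiment name are placeholders.

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

tfrecords, prep = process_step(datastore, compute, "seer/images")
training_out, train = train_step(datastore, tfrecords, compute)
model, register = register_step(datastore, training_out, compute, build="local-001")

pipeline = Pipeline(workspace=ws, steps=[prep, train, register])
Experiment(ws, "seer-pipeline").submit(pipeline).wait_for_completion(show_output=True)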
def build_vocab_step(train_dir, compute_target):
    '''
    This step builds the vocabulary from the preprocessed training data, using the
    configured input/output columns, vocabulary size, and frequency cutoff.

    :param train_dir: The reference to the directory containing the training data
    :type train_dir: DataReference
    :param compute_target: The compute target to run the step on
    :type compute_target: ComputeTarget

    :return: The build vocab step, step outputs dictionary (keys: vocab_dir)
    :rtype: PythonScriptStep, dict
    '''

    run_config = RunConfiguration()
    run_config.environment.docker.enabled = True
    run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    run_config.environment.python.user_managed_dependencies = False
    conda_packages = ['pytorch']
    run_config.environment.python.conda_dependencies = CondaDependencies.create(
        conda_packages=conda_packages)

    input_col = PipelineParameter(name='input_col', default_value='Title')
    output_col = PipelineParameter(name='output_col', default_value='Abstract')
    size = PipelineParameter(name='size', default_value=50000)
    freq_cutoff = PipelineParameter(name='freq_cutoff', default_value=2)

    vocab_dir = PipelineData(name='vocab_dir',
                             pipeline_output_name='vocab_dir',
                             datastore=train_dir.datastore,
                             output_mode='mount',
                             is_directory=True)

    outputs = [vocab_dir]
    outputs_map = {
        'vocab_dir': vocab_dir,
    }

    step = PythonScriptStep(name="Build Vocab",
                            script_name='build_vocab.py',
                            arguments=[
                                '--train_dir',
                                train_dir,
                                '--vocab_dir',
                                vocab_dir,
                                '--input_col',
                                input_col,
                                '--output_col',
                                output_col,
                                '--size',
                                size,
                                '--freq_cutoff',
                                freq_cutoff,
                            ],
                            inputs=[train_dir],
                            outputs=outputs,
                            compute_target=compute_target,
                            runconfig=run_config,
                            source_directory=os.path.dirname(
                                os.path.abspath(__file__)),
                            allow_reuse=True)

    return step, outputs_map
Example #30
# The Azure ML pipeline is composed of two steps:
#
#  - Data pre-processing, which consists of one-hot encoding the categorical features, normalizing the feature set, splitting the dataset into training/testing sets, and finally writing the output to storage.
#
#  - A HyperDrive step that tunes and trains the deep kernel learning model using GPyTorch and the PyTorch estimator.
#%% [markdown]
# ## Pipeline data input/output
#
# Here, we define the input and intermediate datasets that will be used by the pipeline steps.

#%%
input_dir = DataReference(datastore=default_store,
                          data_reference_name="input_data",
                          path_on_datastore="churn")

processed_dir = PipelineData(name='processed_data', datastore=default_store)

#%% [markdown]
# ## Pipeline 1st step: Data Preprocessing
#
# We start by defining the run configuration with the dependencies needed by the preprocessing step.
#
# In the cells that follow, we compose the first step of the pipeline.
#

#%%
cd = CondaDependencies()
cd.add_conda_package('pandas')
cd.add_conda_package('matplotlib')
cd.add_conda_package('numpy')
cd.add_conda_package('scikit-learn')
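A hedged continuation (not part of the original notebook): attach the conda dependencies to a run configuration and compose the preprocessing step that the markdown above describes. The script name, source directory, and compute target are placeholders.

#%%
from azureml.core.runconfig import RunConfiguration
from azureml.pipeline.steps import PythonScriptStep

preprocess_run_config = RunConfiguration(conda_dependencies=cd)
preprocess_run_config.environment.docker.enabled = True

preprocess_step = PythonScriptStep(
    name="Preprocess Data",
    script_name="preprocess.py",           # placeholder entry script
    source_directory=".",                  # placeholder source directory
    arguments=["--input_dir", input_dir, "--output_dir", processed_dir],
    inputs=[input_dir],
    outputs=[processed_dir],
    compute_target=cpu_compute_target,     # placeholder ComputeTarget
    runconfig=preprocess_run_config,
    allow_reuse=True)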