def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(aml_workspace,
                                  e.aml_env_name,
                                  create_new=False)  # NOQA: E501

    run_config = RunConfiguration()
    run_config.environment = environment

    if (e.datastore_name):
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name  # NOQA: E501
def get_run_configs(
    ws: Workspace, computetarget: ComputeTarget, env: Env
) -> Tuple[ParallelRunConfig, RunConfiguration]:
    """
    Creates the necessary run configurations required by the
    pipeline to enable parallelized scoring.

    :param ws: AML Workspace
    :param computetarget: AML Compute target
    :param env: Environment Variables

    :returns: Tuple[Scoring Run configuration, Score copy run configuration]
    """

    # get a conda environment for scoring
    environment = get_environment(
        ws,
        env.aml_env_name_scoring,
        conda_dependencies_file=env.aml_env_score_conda_dep_file,
        enable_docker=True,
        use_gpu=env.use_gpu_for_scoring,
        create_new=env.rebuild_env_scoring,
    )

    score_run_config = ParallelRunConfig(
        entry_script=env.batchscore_script_path,
        source_directory=env.sources_directory_train,
        error_threshold=10,
        output_action="append_row",
        compute_target=computetarget,
        node_count=env.max_nodes_scoring,
        environment=environment,
        run_invocation_timeout=300,
    )

    copy_run_config = RunConfiguration()
    copy_run_config.environment = get_environment(
        ws,
        env.aml_env_name_score_copy,
        conda_dependencies_file=env.aml_env_scorecopy_conda_dep_file,
        enable_docker=True,
        use_gpu=env.use_gpu_for_scoring,
        create_new=env.rebuild_env_scoring,
    )
    return (score_run_config, copy_run_config)
예제 #3
0
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    # Make sure to include `r-essentials'
    #   in h1c4driver/conda_dependencies.yml
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        conda_dependencies_file=e.aml_env_train_conda_dep_file,
        create_new=e.rebuild_env,
    )  # NOQA: E501
    run_config = RunConfiguration()
    run_config.environment = environment

    train_step = PythonScriptStep(
        name="Train Model",
        script_name="train_with_r.py",
        compute_target=aml_compute,
        source_directory="h1c4driver/training/R",
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    steps = [train_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        conda_dependencies_file=e.aml_env_train_conda_dep_file,
        create_new=e.rebuild_env,
    )  #
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name  # NOQA: E501

    model_name_param = PipelineParameter(
        name="model_name", default_value=e.model_name)  # NOQA: E501
    dataset_version_param = PipelineParameter(name="dataset_version",
                                              default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(name="data_file_path",
                                             default_value="none")
    caller_run_id_param = PipelineParameter(name="caller_run_id",
                                            default_value="none")  # NOQA: E501

    # Get dataset name
    dataset_name = e.dataset_name

    # Check to see if dataset exists
    if dataset_name not in aml_workspace.datasets:
        # This call creates an example CSV from sklearn sample data. If you
        # have already bootstrapped your project, you can comment this line
        # out and use your own CSV.
        create_sample_data_csv()

        # Use a CSV to read in the data set.
        file_name = "safedriver.csv"

        if not os.path.exists(file_name):
            raise Exception(
                'Could not find CSV dataset at "%s". If you have bootstrapped your project, you will need to provide a CSV.'  # NOQA: E501
                % file_name)  # NOQA: E501

        # Upload file to default datastore in workspace
        datatstore = Datastore.get(aml_workspace, datastore_name)
        target_path = "training-data/"
        datatstore.upload_files(
            files=[file_name],
            target_path=target_path,
            overwrite=True,
            show_progress=False,
        )

        # Register dataset
        path_on_datastore = os.path.join(target_path, file_name)
        dataset = Dataset.Tabular.from_delimited_files(
            path=(datatstore, path_on_datastore))
        dataset = dataset.register(
            workspace=aml_workspace,
            name=dataset_name,
            description="safedriver training data",
            tags={"format": "CSV"},
            create_new_version=True,
        )

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        "pipeline_data", datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_output",
            pipeline_data,
            "--dataset_version",
            dataset_version_param,
            "--data_file_path",
            data_file_path_param,
            "--caller_run_id",
            caller_run_id_param,
            "--dataset_name",
            dataset_name,
        ],
        runconfig=run_config,
        allow_reuse=True,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name",
            model_name_param,
            "--allow_run_cancel",
            e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_input",
            pipeline_data,
        ],  # NOQA: E501
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")
    # Check run_evaluation flag to include or exclude evaluation step.
    if (e.run_evaluation).lower() == "true":
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
예제 #5
0
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(aml_workspace,
                                  e.aml_env_name,
                                  create_new=False)  # NOQA: E501

    run_config = RunConfiguration()
    run_config.environment = environment

    if (e.datastore_name):
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name  # NOQA: E501

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)
    dataset_version_param = PipelineParameter(name="dataset_version",
                                              default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(name="data_file_path",
                                             default_value="none")
    caller_run_id_param = PipelineParameter(name="caller_run_id",
                                            default_value="none")

    # Get dataset name
    dataset_name = e.dataset_name

    # # Check to see if dataset exists
    # if (dataset_name not in aml_workspace.datasets):
    #     # Create dataset from lacemlops sample data
    #     sample_data = load_lacemlops()
    #     df = pd.DataFrame(
    #         data=sample_data.data,
    #         columns=sample_data.feature_names)
    #     df['Y'] = sample_data.target
    #     file_name = 'lacemlops.csv'
    #     df.to_csv(file_name, index=False)

    #     # Upload file to default datastore in workspace
    #     datatstore = Datastore.get(aml_workspace, datastore_name)
    #     target_path = 'training-data/'
    #     datatstore.upload_files(
    #         files=[file_name],
    #         target_path=target_path,
    #         overwrite=True,
    #         show_progress=False)

    #     # Register dataset
    #     path_on_datastore = os.path.join(target_path, file_name)
    #     dataset = Dataset.Tabular.from_delimited_files(
    #         path=(datatstore, path_on_datastore))
    #     dataset = dataset.register(
    #         workspace=aml_workspace,
    #         name=dataset_name,
    #         description='lacemlops training data',
    #         tags={'format': 'CSV'},
    #         create_new_version=True)

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        'pipeline_data', datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_output",
            pipeline_data,
            "--dataset_version",
            dataset_version_param,
            "--data_file_path",
            data_file_path_param,
            "--caller_run_id",
            caller_run_id_param,
            "--dataset_name",
            dataset_name,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name",
            model_name_param,
            "--allow_run_cancel",
            e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_input",
            pipeline_data,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")
    # Check run_evaluation flag to include or exclude evaluation step.
    if ((e.run_evaluation).lower() == 'true'):
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
예제 #6
0
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print(f"get_workspace:{aml_workspace}")

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print(f"aml_compute:{aml_compute}")

    # Create a reusable Azure ML environment
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        create_new=e.rebuild_env,
        enable_docker=True,
        dockerfile='ml_model/preprocess/Dockerfile'
    )  #
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables["DATASTORE_NAME"] = datastore_name  # NOQA: E501

    datastore = Datastore(aml_workspace, name=datastore_name)
    data_file_path_param = PipelineParameter(name="data_file_path", default_value=e.dataset_name)  # NOQA: E501

    # The version of the input/output dataset can't be determined at pipeline publish time, only run time.  # NOQA: E501
    # Options to store output data:
    # Option 1: Use blob API to write output data. Otherwise, no way to dynamically change the output dataset based on PipelineParameter, # NOQA: E501
    #     The following will not work. It generate a path like "PipelineParameter_Name:data_file_path_Default:gear_images"  # NOQA: E501
    #         output_ds = OutputFileDatasetConfig(destination=(datastore, data_file_path_param))  # NOQA: E501
    #     This option means writing a file locally and upload to the datastore. Fewer dataset, more code.  # NOQA: E501
    # Option 2: Use a dynamic path in OutputFileDatasetConfig, and register a new dataset at completion  # NOQA: E501
    #     Output dataset can be mounted, so more dataset to maintain, less code.   # NOQA: E501
    # Using Option 2 below.
    output_dataset = OutputFileDatasetConfig(
        name=e.processed_dataset_name,
        destination=(datastore, "/dataset/{output-name}/{run-id}")
    ).register_on_complete(
        name=e.processed_dataset_name)

    preprocess_step = PythonScriptStep(
        name="Preprocess Data with OS cmd",
        script_name='preprocess/preprocess_os_cmd_aml.py',
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--dataset_name", e.dataset_name,
            "--datastore_name", datastore_name,
            "--data_file_path", data_file_path_param,
            "--output_dataset", output_dataset,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Preprocess OS cmd created")

    steps = [preprocess_step]
    preprocess_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    preprocess_pipeline._set_experiment_name
    preprocess_pipeline.validate()
    published_pipeline = preprocess_pipeline.publish(
        name=e.preprocessing_pipeline_name,
        description="Data preprocessing OS cmd pipeline",
        version=e.build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
def main():
    e = Env()
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group
    )
    print("get_workspace:")
    print(aml_workspace)

    aml_compute = get_compute(
        aml_workspace,
        e.compute_name,
        e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    environment = get_environment(
        aml_workspace, e.aml_env_name, create_new=e.rebuild_env)
    run_config = RunConfiguration()
    run_config.environment = environment

    if (e.datastore_name):
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name

    run_config.environment.environment_variables["DATASTORE_NAME"] \
        = datastore_name

    dataset_name = e.dataset_name
    file_name = e.file_name
    datastore = Datastore.get(aml_workspace, datastore_name)

    if (dataset_name not in aml_workspace.datasets):
        raise Exception("Could not find dataset at \"%s\"." % dataset_name)
    else:
        dataset = Dataset.get_by_name(aml_workspace, name=dataset_name)
        dataset.download(target_path='.', overwrite=True)
        datastore.upload_files([file_name],
                               target_path=dataset_name,
                               overwrite=True)

    raw_data_file = DataReference(datastore=datastore,
                                  data_reference_name="Raw_Data_File",
                                  path_on_datastore=dataset_name + '/'
                                  + file_name)

    clean_data_file = PipelineParameter(name="clean_data_file",
                                        default_value="/clean_data.csv")
    clean_data_folder = PipelineData("clean_data_folder",
                                     datastore=datastore)

    prepDataStep = PythonScriptStep(name="Prepare Data",
                                    source_directory=e.sources_directory_train,
                                    script_name=e.data_prep_script_path,
                                    arguments=["--raw_data_file",
                                               raw_data_file,
                                               "--clean_data_folder",
                                               clean_data_folder,
                                               "--clean_data_file",
                                               clean_data_file],
                                    inputs=[raw_data_file],
                                    outputs=[clean_data_folder],
                                    compute_target=aml_compute,
                                    allow_reuse=False)

    print("Step Prepare Data created")

    new_model_file = PipelineParameter(name="new_model_file ",
                                       default_value='/' + e.model_name
                                       + '.pkl')
    new_model_folder = PipelineData("new_model_folder", datastore=datastore)
    est = SKLearn(source_directory=e.sources_directory_train,
                  entry_script=e.train_script_path,
                  pip_packages=['azureml-sdk', 'scikit-learn==0.20.3',
                                'azureml-dataprep[pandas,fuse]>=1.1.14'],
                  compute_target=aml_compute)

    trainingStep = EstimatorStep(
        name="Model Training",
        estimator=est,
        estimator_entry_script_arguments=["--clean_data_folder",
                                          clean_data_folder,
                                          "--new_model_folder",
                                          new_model_folder,
                                          "--clean_data_file",
                                          clean_data_file.default_value,
                                          "--new_model_file",
                                          new_model_file.default_value],
        runconfig_pipeline_params=None,
        inputs=[clean_data_folder],
        outputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Train created")

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)

    evaluateStep = PythonScriptStep(
        name="Evaluate Model",
        source_directory=e.sources_directory_train,
        script_name=e.evaluate_script_path,
        arguments=["--model_name", model_name_param],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Evaluate created")

    registerStep = PythonScriptStep(
        name="Register Model",
        source_directory=e.sources_directory_train,
        script_name=e.register_script_path,
        arguments=["--new_model_folder", new_model_folder,
                   "--new_model_file", new_model_file,
                   "--model_name", model_name_param],
        inputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Register created")

    if ((e.run_evaluation).lower() == 'true'):
        print("Include evaluation step before register step.")
        trainingStep.run_after(prepDataStep)
        evaluateStep.run_after(trainingStep)
        registerStep.run_after(evaluateStep)
    else:
        print("Exclude evaluation step and directly run register step.")
        trainingStep.run_after(prepDataStep)
        registerStep.run_after(trainingStep)

    pipeline = Pipeline(workspace=aml_workspace, steps=[registerStep])
    pipeline.validate()
    print("Pipeline is built")

    pipeline._set_experiment_name
    published_pipeline = pipeline.publish(
        name=e.pipeline_name,
        description="Predict Employee Retention Model training pipeline",
        version=e.build_id
    )
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
예제 #8
0
def main():
    e = Env()
    print(e.__dict__)
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(aml_workspace,
                                  e.aml_env_name,
                                  create_new=e.rebuild_env)  #
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name  # NOQA: E501

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)
    dataset_version_param = PipelineParameter(name="dataset_version",
                                              default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(name="data_file_path",
                                             default_value="none")
    caller_run_id_param = PipelineParameter(name="caller_run_id",
                                            default_value="none")

    # Get dataset name
    dataset_name = e.dataset_name

    # Check to see if dataset exists
    if dataset_name not in aml_workspace.datasets:
        raise ValueError(
            f"can't find dataset {dataset_name} in datastore {datastore_name}")

    # Create PipelineData to pass data between steps
    model_data = PipelineData("model_data",
                              datastore=aml_workspace.get_default_datastore())
    train_ds = (PipelineData("train_ds",
                             datastore=aml_workspace.get_default_datastore()).
                as_dataset().parse_delimited_files().register(
                    name="train", create_new_version=True))
    test_ds = (PipelineData(
        "test_ds", datastore=aml_workspace.get_default_datastore()).as_dataset(
        ).parse_delimited_files().register(name="test",
                                           create_new_version=True))

    prepare_step = PythonScriptStep(
        name="Prepare Data",
        script_name=e.prepare_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[train_ds, test_ds],
        arguments=[
            "--dataset_version", dataset_version_param, "--data_file_path",
            data_file_path_param, "--dataset_name", dataset_name,
            "--caller_run_id", caller_run_id_param, "--train_ds", train_ds,
            "--test_ds", test_ds
        ],
        runconfig=run_config,
        allow_reuse=True,
    )
    print("Step Prepare created")

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[
            train_ds.as_named_input("training_data"),
            test_ds.as_named_input("testing_data")
        ],
        outputs=[model_data],
        arguments=[
            "--model_name", model_name_param, "--model_data", model_data
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name",
            model_name_param,
            "--allow_run_cancel",
            e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[model_data],
        arguments=[
            "--model_name", model_name_param, "--step_input", model_data
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")
    # Check run_evaluation flag to include or exclude evaluation step.
    if (e.run_evaluation).lower() == "true":
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [prepare_step, train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [prepare_step, train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print(f"get_workspace:{aml_workspace}")

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print(f"aml_compute:{aml_compute}")

    # Create a reusable Azure ML environment
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        conda_dependencies_file=e.aml_env_train_conda_dep_file,
        create_new=e.rebuild_env,
    )  #
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name  # NOQA: E501

    # datastore and dataset names are fixed for this pipeline, however
    # data_file_path can be specified for registering new versions of dataset
    # Note that AML pipeline parameters don't take empty string as default, "" won't work  # NOQA: E501
    model_name_param = PipelineParameter(
        name="model_name", default_value=e.model_name)  # NOQA: E501
    data_file_path_param = PipelineParameter(
        name="data_file_path", default_value="nopath")  # NOQA: E501
    ml_params = PipelineParameter(name="ml_params",
                                  default_value="default")  # NOQA: E501

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        "pipeline_data", datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name="train/train_aml.py",
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_output",
            pipeline_data,
            "--data_file_path",
            data_file_path_param,
            "--dataset_name",
            e.processed_dataset_name,
            "--datastore_name",
            datastore_name,
            "--ml_params",
            ml_params,
        ],
        runconfig=run_config,
        allow_reuse=True,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name="evaluate/evaluate_model.py",
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name",
            model_name_param,
            "--ml_params",
            ml_params,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name="register/register_model.py",
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_input",
            pipeline_data,
            "--ml_params",
            ml_params,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")

    evaluate_step.run_after(train_step)
    register_step.run_after(evaluate_step)
    steps = [train_step, evaluate_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.training_pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")