Пример #1
0
def create_runconfig(aml_compute, env=None):
    # Create a new runconfig object
    aml_run_config = RunConfiguration()

    # Use the aml_compute you created above.
    aml_run_config.target = aml_compute

    if env:
        aml_run_config.environment = env
    else:
        aml_run_config.environment = create_env_from_requirements()

    return aml_run_config
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(aml_workspace,
                                  e.aml_env_name,
                                  create_new=False)  # NOQA: E501

    run_config = RunConfiguration()
    run_config.environment = environment

    if (e.datastore_name):
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name  # NOQA: E501
Пример #3
0
def create_run_config(azure_config: AzureConfig,
                      source_config: SourceConfig,
                      all_azure_dataset_ids: List[str],
                      all_dataset_mountpoints: List[str],
                      environment_name: str = "") -> ScriptRunConfig:
    """
    Creates a configuration to run the InnerEye training script in AzureML.
    :param azure_config: azure related configurations to use for model scale-out behaviour
    :param source_config: configurations for model execution, such as name and execution mode
    :param all_azure_dataset_ids: The name of all datasets on blob storage that will be used for this run.
    :param all_dataset_mountpoints: When using the datasets in AzureML, these are the per-dataset mount points.
    :param environment_name: If specified, try to retrieve the existing Python environment with this name. If that
    is not found, create one from the Conda files provided in `source_config`. This parameter is meant to be used
    when running inference for an existing model.
    :return: The configured script run.
    """
    dataset_consumptions = create_dataset_consumptions(
        azure_config, all_azure_dataset_ids, all_dataset_mountpoints)
    # AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
    entry_script_relative_path = source_config.entry_script.relative_to(
        source_config.root_folder).as_posix()
    logging.info(
        f"Entry script {entry_script_relative_path} ({source_config.entry_script} relative to "
        f"source directory {source_config.root_folder})")
    max_run_duration = None
    if azure_config.max_run_duration:
        max_run_duration = run_duration_string_to_seconds(
            azure_config.max_run_duration)
    workspace = azure_config.get_workspace()
    run_config = RunConfiguration(
        script=entry_script_relative_path,
        arguments=source_config.script_params,
    )
    run_config.environment = get_or_create_python_environment(
        azure_config, source_config, environment_name=environment_name)
    run_config.target = azure_config.cluster
    run_config.max_run_duration_seconds = max_run_duration
    if azure_config.num_nodes > 1:
        distributed_job_config = MpiConfiguration(
            node_count=azure_config.num_nodes)
        run_config.mpi = distributed_job_config
        run_config.framework = "Python"
        run_config.communicator = "IntelMpi"
        run_config.node_count = distributed_job_config.node_count
    if len(dataset_consumptions) > 0:
        run_config.data = {
            dataset.name: dataset
            for dataset in dataset_consumptions
        }
    # Use blob storage for storing the source, rather than the FileShares section of the storage account.
    run_config.source_directory_data_store = workspace.datastores.get(
        WORKSPACE_DEFAULT_BLOB_STORE_NAME).name
    script_run_config = ScriptRunConfig(
        source_directory=str(source_config.root_folder),
        run_config=run_config,
    )
    if azure_config.hyperdrive:
        script_run_config = source_config.hyperdrive_config_func(
            script_run_config)  # type: ignore
    return script_run_config
Пример #4
0
def create_run_config(azure_config: AzureConfig,
                      source_config: SourceConfig,
                      azure_dataset_id: str = "",
                      environment_name: str = "") -> ScriptRunConfig:
    """
    Creates a configuration to run the InnerEye training script in AzureML.
    :param azure_config: azure related configurations to use for model scale-out behaviour
    :param source_config: configurations for model execution, such as name and execution mode
    :param azure_dataset_id: The name of the dataset in blob storage to be used for this run. This can be an empty
    string to not use any datasets.
    :param environment_name: If specified, try to retrieve the existing Python environment with this name. If that
    is not found, create one from the Conda files provided in `source_config`. This parameter is meant to be used
    when running inference for an existing model.
    :return: The configured script run.
    """
    if azure_dataset_id:
        azureml_dataset = get_or_create_dataset(azure_config, azure_dataset_id=azure_dataset_id)
        if not azureml_dataset:
            raise ValueError(f"AzureML dataset {azure_dataset_id} could not be found or created.")
        named_input = azureml_dataset.as_named_input(INPUT_DATA_KEY)
        dataset_consumption = named_input.as_mount() if azure_config.use_dataset_mount else named_input.as_download()
    else:
        dataset_consumption = None
    # AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
    entry_script_relative_path = source_config.entry_script.relative_to(source_config.root_folder).as_posix()
    logging.info(f"Entry script {entry_script_relative_path} ({source_config.entry_script} relative to "
                 f"source directory {source_config.root_folder})")
    max_run_duration = None
    if azure_config.max_run_duration:
        max_run_duration = run_duration_string_to_seconds(azure_config.max_run_duration)
    workspace = azure_config.get_workspace()
    run_config = RunConfiguration(
        script=entry_script_relative_path,
        arguments=source_config.script_params,
    )
    run_config.environment = get_or_create_python_environment(azure_config, source_config,
                                                              environment_name=environment_name)
    run_config.target = azure_config.cluster
    run_config.max_run_duration_seconds = max_run_duration
    if azure_config.num_nodes > 1:
        distributed_job_config = MpiConfiguration(node_count=azure_config.num_nodes)
        run_config.mpi = distributed_job_config
        run_config.framework = "Python"
        run_config.communicator = "IntelMpi"
        run_config.node_count = distributed_job_config.node_count
    if dataset_consumption:
        run_config.data = {dataset_consumption.name: dataset_consumption}
    # Use blob storage for storing the source, rather than the FileShares section of the storage account.
    run_config.source_directory_data_store = workspace.datastores.get(WORKSPACE_DEFAULT_BLOB_STORE_NAME).name
    script_run_config = ScriptRunConfig(
        source_directory=str(source_config.root_folder),
        run_config=run_config,
    )
    if azure_config.hyperdrive:
        script_run_config = source_config.hyperdrive_config_func(script_run_config)  # type: ignore
    return script_run_config
Пример #5
0
    def get_run_config(self, config):

        environment_config = config.get("environment")
        environment = self.get_environment(environment_config)

        cluster_name = config.get("cluster")
        cluster = ComputeTarget(workspace=self.workspace, name=cluster_name)

        pipeline_run_config = RunConfiguration()
        pipeline_run_config.target = cluster
        pipeline_run_config.environment = environment

        return pipeline_run_config
Пример #6
0
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    # Make sure to include `r-essentials'
    #   in h1c4driver/conda_dependencies.yml
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        conda_dependencies_file=e.aml_env_train_conda_dep_file,
        create_new=e.rebuild_env,
    )  # NOQA: E501
    run_config = RunConfiguration()
    run_config.environment = environment

    train_step = PythonScriptStep(
        name="Train Model",
        script_name="train_with_r.py",
        compute_target=aml_compute,
        source_directory="h1c4driver/training/R",
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    steps = [train_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable run configuration environment
    # Read definition from diabetes_regression/azureml_environment.json
    # Make sure to include `r-essentials'
    #   in diabetes_regression/conda_dependencies.yml
    environment = Environment.load_from_directory(e.sources_directory_train)
    if (e.collection_uri is not None and e.teamproject_name is not None):
        builduri_base = e.collection_uri + e.teamproject_name
        builduri_base = builduri_base + "/_build/results?buildId="
        environment.environment_variables["BUILDURI_BASE"] = builduri_base
    environment.register(aml_workspace)

    run_config = RunConfiguration()
    run_config.environment = environment

    train_step = PythonScriptStep(
        name="Train Model",
        script_name="train_with_r.py",
        compute_target=aml_compute,
        source_directory="diabetes_regression/training/R",
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    steps = [train_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Пример #8
0
def create_runconfig(aml_compute, env=None):
    # Create a new runconfig object
    aml_run_config = RunConfiguration()

    # Use the aml_compute you created above.
    aml_run_config.target = aml_compute

    if env is not None:
        aml_run_config.environment = env
    else:
        # Enable Docker
        aml_run_config.environment.docker.enabled = True

        # Set Docker base image to the default CPU-based image
        aml_run_config.environment.docker.base_image = "mcr.microsoft.com/azureml/base:0.2.1"

        # Use conda_dependencies.yml to create a conda environment in the Docker image for execution
        aml_run_config.environment.python.user_managed_dependencies = False

    return aml_run_config
Пример #9
0
# Build CPU image for Ray
ray_cpu_env = Environment.from_dockerfile(
    name=ray_environment_name, dockerfile=ray_environment_dockerfile_path)
ray_cpu_env.register(workspace=ws)
ray_cpu_build_details = ray_cpu_env.build(workspace=ws)

while ray_cpu_build_details.status not in ["Succeeded", "Failed"]:
    print(
        f"Awaiting completion of ray CPU environment build. Current status is: {ray_cpu_build_details.status}"
    )
    time.sleep(10)

command = ["python distribute_automl.py"]
env = Environment.get(workspace=ws, name=ray_environment_name)
compute_target = ws.compute_targets["cpucluster"]
aml_run_config = RunConfiguration(communicator="OpenMpi")
aml_run_config.target = compute_target
aml_run_config.docker = DockerConfiguration(use_docker=True)
aml_run_config.environment = env
aml_run_config.node_count = 2
config = ScriptRunConfig(
    source_directory="ray/",
    command=command,
    run_config=aml_run_config,
)

exp = Experiment(ws, "distribute-automl")
run = exp.submit(config)
print(run.get_portal_url())  # link to ml.azure.com
run.wait_for_completion(show_output=True)
Пример #10
0
def main():
    env = EnvironmentVariables()
    args = add_arguments()

    workspace = get_workspace()

    cpu_cluster_name = env.cpu_cluster_name
    compute = get_or_create_compute(workspace, cpu_cluster_name,
                                    env.compute_vm_size, env.max_nodes)

    environment = Environment.load_from_directory(env.sources_directory_train)
    environment.register(workspace)
    run_configuration = RunConfiguration()
    run_configuration.environment = environment

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=env.model_name)
    build_id_param = PipelineParameter(name="build_id",
                                       default_value=env.build_id)
    should_tune_hyperparameters_param = PipelineParameter(
        name="should_tune_hyperparameters",
        default_value=env.should_tune_hyperparameters)
    parallelism_level_param = PipelineParameter(
        name="parallelism_level", default_value=env.parallelism_level)
    force_register_param = PipelineParameter(name="force_register",
                                             default_value=env.force_register)

    datastore = get_datastore()

    dataset_name = env.dataset_name
    dataset_path = env.dataset_path
    print(
        f"Creating new dataset version for {dataset_name} in datastore {datastore} from file {dataset_path}"
    )
    temp_dataset = Dataset.Tabular.from_delimited_files(path=[(datastore,
                                                               dataset_path)])
    dataset = temp_dataset.register(workspace=workspace,
                                    name=dataset_name,
                                    description=dataset_name,
                                    tags={'format': 'CSV'},
                                    create_new_version=True)

    train_output = PipelineData('train_output',
                                output_name='train_output',
                                datastore=datastore)

    train_step = PythonScriptStep(
        name="Train model",
        compute_target=compute,
        script_name=env.train_script_name,
        runconfig=run_configuration,
        inputs=[dataset.as_named_input('training')],
        outputs=[train_output],
        arguments=[
            "--build_id", build_id_param, "--model_name", model_name_param,
            "--parallelism_level", parallelism_level_param,
            "--should_tune_hyperparameters", should_tune_hyperparameters_param
        ],
        allow_reuse=False)

    evaluate_step = PythonScriptStep(name="Evaluate model",
                                     compute_target=compute,
                                     script_name=env.evaluate_script_name,
                                     runconfig=run_configuration,
                                     inputs=[train_output],
                                     arguments=[
                                         "--build_id", build_id_param,
                                         "--model_name", model_name_param,
                                         "--train_output", train_output,
                                         "--force_register",
                                         force_register_param
                                     ],
                                     allow_reuse=False)

    register_step = PythonScriptStep(name="Register model",
                                     compute_target=compute,
                                     script_name=env.register_script_name,
                                     runconfig=run_configuration,
                                     inputs=[train_output],
                                     arguments=[
                                         "--build_id", build_id_param,
                                         "--model_name", model_name_param,
                                         "--train_output", train_output
                                     ],
                                     allow_reuse=False)

    evaluate_step.run_after(train_step)
    register_step.run_after(evaluate_step)

    steps = [train_step, evaluate_step, register_step]

    train_pipeline = Pipeline(workspace=workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=env.pipeline_name,
        description="Train/Eval/Register if better pipeline",
        version=env.build_id)

    output_file_name = args.output_file_name
    if output_file_name:
        with open(output_file_name, "w") as output_file:
            output_file.write(published_pipeline.id)

    print(
        f"Published pipeline {published_pipeline.name} for build {published_pipeline.version}"
    )
Пример #11
0
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(aml_workspace,
                                  e.aml_env_name,
                                  create_new=False)  # NOQA: E501

    run_config = RunConfiguration()
    run_config.environment = environment

    if (e.datastore_name):
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name  # NOQA: E501

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)
    dataset_version_param = PipelineParameter(name="dataset_version",
                                              default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(name="data_file_path",
                                             default_value="none")
    caller_run_id_param = PipelineParameter(name="caller_run_id",
                                            default_value="none")

    # Get dataset name
    dataset_name = e.dataset_name

    # # Check to see if dataset exists
    # if (dataset_name not in aml_workspace.datasets):
    #     # Create dataset from lacemlops sample data
    #     sample_data = load_lacemlops()
    #     df = pd.DataFrame(
    #         data=sample_data.data,
    #         columns=sample_data.feature_names)
    #     df['Y'] = sample_data.target
    #     file_name = 'lacemlops.csv'
    #     df.to_csv(file_name, index=False)

    #     # Upload file to default datastore in workspace
    #     datatstore = Datastore.get(aml_workspace, datastore_name)
    #     target_path = 'training-data/'
    #     datatstore.upload_files(
    #         files=[file_name],
    #         target_path=target_path,
    #         overwrite=True,
    #         show_progress=False)

    #     # Register dataset
    #     path_on_datastore = os.path.join(target_path, file_name)
    #     dataset = Dataset.Tabular.from_delimited_files(
    #         path=(datatstore, path_on_datastore))
    #     dataset = dataset.register(
    #         workspace=aml_workspace,
    #         name=dataset_name,
    #         description='lacemlops training data',
    #         tags={'format': 'CSV'},
    #         create_new_version=True)

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        'pipeline_data', datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_output",
            pipeline_data,
            "--dataset_version",
            dataset_version_param,
            "--data_file_path",
            data_file_path_param,
            "--caller_run_id",
            caller_run_id_param,
            "--dataset_name",
            dataset_name,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name",
            model_name_param,
            "--allow_run_cancel",
            e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_input",
            pipeline_data,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")
    # Check run_evaluation flag to include or exclude evaluation step.
    if ((e.run_evaluation).lower() == 'true'):
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Пример #12
0
# Add the dependencies to the environment
fraud_env.python.conda_dependencies = fraud_packages

# Register the environment (just in case you want to use it again)
fraud_env.register(workspace=ws)
registered_env = Environment.get(ws, 'fraud-pipeline-env')

# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()

# Use the compute you created above.
pipeline_run_config.target = pipeline_cluster

# Assign the environment to the run configuration
pipeline_run_config.environment = registered_env

print("Run configuration created.")

# Get the training dataset
fraud_ds = ws.datasets.get("creditcard")

# Create a PipelineData (temporary Data Reference) for the model folder
model_folder = PipelineData("model_folder",
                            datastore=ws.get_default_datastore())
#pipeline_data = PipelineData('pipeline_data',  datastore=default_ds)

data_ref = DataReference(datastore=default_ds,
                         data_reference_name='data_ref',
                         path_on_datastore="config/")
Пример #13
0
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print(f"get_workspace:{aml_workspace}")

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print(f"aml_compute:{aml_compute}")

    # Create a reusable Azure ML environment
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        create_new=e.rebuild_env,
        enable_docker=True,
        dockerfile='ml_model/preprocess/Dockerfile'
    )  #
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables["DATASTORE_NAME"] = datastore_name  # NOQA: E501

    datastore = Datastore(aml_workspace, name=datastore_name)
    data_file_path_param = PipelineParameter(name="data_file_path", default_value=e.dataset_name)  # NOQA: E501

    # The version of the input/output dataset can't be determined at pipeline publish time, only run time.  # NOQA: E501
    # Options to store output data:
    # Option 1: Use blob API to write output data. Otherwise, no way to dynamically change the output dataset based on PipelineParameter, # NOQA: E501
    #     The following will not work. It generate a path like "PipelineParameter_Name:data_file_path_Default:gear_images"  # NOQA: E501
    #         output_ds = OutputFileDatasetConfig(destination=(datastore, data_file_path_param))  # NOQA: E501
    #     This option means writing a file locally and upload to the datastore. Fewer dataset, more code.  # NOQA: E501
    # Option 2: Use a dynamic path in OutputFileDatasetConfig, and register a new dataset at completion  # NOQA: E501
    #     Output dataset can be mounted, so more dataset to maintain, less code.   # NOQA: E501
    # Using Option 2 below.
    output_dataset = OutputFileDatasetConfig(
        name=e.processed_dataset_name,
        destination=(datastore, "/dataset/{output-name}/{run-id}")
    ).register_on_complete(
        name=e.processed_dataset_name)

    preprocess_step = PythonScriptStep(
        name="Preprocess Data with OS cmd",
        script_name='preprocess/preprocess_os_cmd_aml.py',
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--dataset_name", e.dataset_name,
            "--datastore_name", datastore_name,
            "--data_file_path", data_file_path_param,
            "--output_dataset", output_dataset,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Preprocess OS cmd created")

    steps = [preprocess_step]
    preprocess_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    preprocess_pipeline._set_experiment_name
    preprocess_pipeline.validate()
    published_pipeline = preprocess_pipeline.publish(
        name=e.preprocessing_pipeline_name,
        description="Data preprocessing OS cmd pipeline",
        version=e.build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
Пример #14
0
# 6. Submit the pipeline for execution

## Pipeline Parameters ##
# We need to tell the Pipeline what it needs to learn to see!

datapath = DataPath(datastore=datastore, path_on_datastore=datastorepath)
data_path_pipeline_param = (PipelineParameter(name="data",
                                              default_value=datapath),
                            DataPathComputeBinding(mode='mount'))

# Configuration for data prep and training steps #

dataprepEnvironment = Environment.from_pip_requirements(
    'dataprepenv', 'requirements-dataprepandtraining.txt')
dataprepRunConfig = RunConfiguration()
dataprepRunConfig.environment = dataprepEnvironment

## Data Process Step ##
# parse.py file parses the images in our data source #

seer_tfrecords = PipelineData("tfrecords_set",
                              datastore=datastore,
                              is_directory=True)

prepStep = PythonScriptStep('parse.py',
                            source_directory='.',
                            name='Data Preparation',
                            compute_target=compute,
                            arguments=[
                                "--source_path", data_path_pipeline_param,
                                "--target_path", seer_tfrecords
def main():
    e = Env()

    print('********************')
    print(e.source_directory)

    files = os.listdir('./aml_pipeline')
    for f in files:
        print(f)

    print('***************')

    workspace_name = e.workspace_name
    subscription_id = e.subscription_id
    resource_group = e.resource_group

    #Connect to AML Workspace
    print('workspace_name = ' + workspace_name)
    print('subscription_id = ' + subscription_id)
    print('resource_group = ' + resource_group)

    ws = Workspace.get(
        name=workspace_name,
        subscription_id=subscription_id,
        resource_group=resource_group,
    )

    print('Ready to use Azure ML {} to work with {}'.format(
        azureml.core.VERSION, ws.name))

    default_ds = ws.get_default_datastore()

    if 'diabetes dataset' not in ws.datasets:
        default_ds.upload_files(
            files=['diabetes.csv',
                   'diabetes2.csv'],  # Upload the diabetes csv files in /data
            target_path=
            'diabetes-data/',  # Put it in a folder path in the datastore
            overwrite=True,  # Replace existing files of the same name
            show_progress=True)

        #Create a tabular dataset from the path on the datastore (this may take a short while)
        tab_data_set = Dataset.Tabular.from_delimited_files(
            path=(default_ds, 'diabetes-data/*.csv'))

        # Register the tabular dataset
        try:
            tab_data_set = tab_data_set.register(workspace=ws,
                                                 name='diabetes dataset',
                                                 description='diabetes data',
                                                 tags={'format': 'CSV'},
                                                 create_new_version=True)
            print('Dataset registered.')
        except Exception as ex:
            print(ex)
    else:
        print('Dataset already registered.')

    # Create a folder for the pipeline step files
    experiment_folder = 'diabetes_pipeline'
    os.makedirs(experiment_folder, exist_ok=True)

    print(experiment_folder)

    cluster_name = "mmcomputecluster"

    try:
        # Check for existing compute target
        pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
        print('Found existing cluster, use it.')
    except ComputeTargetException:
        # If it doesn't already exist, create it
        try:
            compute_config = AmlCompute.provisioning_configuration(
                vm_size='STANDARD_DS11_V2', max_nodes=2)
            pipeline_cluster = ComputeTarget.create(ws, cluster_name,
                                                    compute_config)
            pipeline_cluster.wait_for_completion(show_output=True)
        except Exception as ex:
            print(ex)

    # Create a Python environment for the experiment
    diabetes_env = Environment("diabetes-pipeline-env")
    diabetes_env.python.user_managed_dependencies = False  # Let Azure ML manage dependencies
    diabetes_env.docker.enabled = True  # Use a docker container

    # Create a set of package dependencies
    diabetes_packages = CondaDependencies.create(
        conda_packages=[
            'scikit-learn', 'ipykernel', 'matplotlib', 'pandas', 'pip'
        ],
        pip_packages=[
            'azureml-defaults', 'azureml-dataprep[pandas]', 'pyarrow'
        ])

    # Add the dependencies to the environment
    diabetes_env.python.conda_dependencies = diabetes_packages

    # Register the environment
    diabetes_env.register(workspace=ws)
    registered_env = Environment.get(ws, 'diabetes-pipeline-env')

    # Create a new runconfig object for the pipeline
    pipeline_run_config = RunConfiguration()

    # Use the compute you created above.
    pipeline_run_config.target = pipeline_cluster

    # Assign the environment to the run configuration
    pipeline_run_config.environment = registered_env

    print("Run configuration created.")

    # Get the training dataset
    diabetes_ds = ws.datasets.get("diabetes dataset")

    # Create a PipelineData (temporary Data Reference) for the model folder
    prepped_data_folder = PipelineData("prepped_data_folder",
                                       datastore=ws.get_default_datastore())

    # Step 1, Run the data prep script
    prep_step = PythonScriptStep(name="Prepare Data",
                                 script_name="prep_diabetes.py",
                                 source_directory='./aml_pipeline',
                                 arguments=[
                                     '--input-data',
                                     diabetes_ds.as_named_input('raw_data'),
                                     '--prepped-data', prepped_data_folder
                                 ],
                                 outputs=[prepped_data_folder],
                                 compute_target=pipeline_cluster,
                                 runconfig=pipeline_run_config,
                                 allow_reuse=True)

    # Step 2, run the training script
    train_step = PythonScriptStep(
        name="Train and Register Model",
        source_directory='./aml_pipeline',
        script_name="train_diabetes.py",
        arguments=['--training-folder', prepped_data_folder],
        inputs=[prepped_data_folder],
        compute_target=pipeline_cluster,
        runconfig=pipeline_run_config,
        allow_reuse=True)

    print("Pipeline steps defined")

    pipeline_steps = [prep_step, train_step]
    pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
    print("Pipeline is built.")

    # Create an experiment and run the pipeline
    experiment = Experiment(workspace=ws, name='jlg-exp')
    pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
    print("Pipeline submitted for execution.")
    pipeline_run.wait_for_completion(show_output=True)

    for run in pipeline_run.get_children():
        print(run.name, ':')
        metrics = run.get_metrics()
        for metric_name in metrics:
            print('\t', metric_name, ":", metrics[metric_name])

    for model in Model.list(ws):
        print(model.name, 'version:', model.version)
        for tag_name in model.tags:
            tag = model.tags[tag_name]
            print('\t', tag_name, ':', tag)
        for prop_name in model.properties:
            prop = model.properties[prop_name]
            print('\t', prop_name, ':', prop)
        print('\n')

    # Publish the pipeline from the run
    published_pipeline = pipeline_run.publish_pipeline(
        name="diabetes-training-pipeline",
        description="Trains diabetes model",
        version="1.0")

    published_pipeline

    rest_endpoint = published_pipeline.endpoint
    print(rest_endpoint)
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group
    )
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(
        aml_workspace,
        e.compute_name,
        e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable run configuration environment
    # Read definition from diabetes_regression/azureml_environment.json
    environment = Environment.load_from_directory(e.sources_directory_train)
    if (e.collection_uri is not None and e.teamproject_name is not None):
        builduri_base = e.collection_uri + e.teamproject_name
        builduri_base = builduri_base + "/_build/results?buildId="
        environment.environment_variables["BUILDURI_BASE"] = builduri_base
    environment.register(aml_workspace)

    run_config = RunConfiguration()
    run_config.environment = environment

    model_name_param = PipelineParameter(
        name="model_name", default_value=e.model_name)
    build_id_param = PipelineParameter(
        name="build_id", default_value=e.build_id)

    dataset_name = ""
    if (e.datastore_name is not None and e.datafile_name is not None):
        dataset_name = e.dataset_name
        datastore = Datastore.get(aml_workspace, e.datastore_name)
        data_path = [(datastore, e.datafile_name)]
        dataset = Dataset.Tabular.from_delimited_files(path=data_path)
        dataset.register(workspace=aml_workspace,
                         name=e.dataset_name,
                         description="dataset with training data",
                         create_new_version=True)

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id", build_id_param,
            "--model_name", model_name_param,
            "--dataset_name", dataset_name,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id", build_id_param,
            "--model_name", model_name_param,
            "--allow_run_cancel", e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id", build_id_param,
            "--model_name", model_name_param,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")
    # Check run_evaluation flag to include or exclude evaluation step.
    if ((e.run_evaluation).lower() == 'true'):
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id
    )
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        conda_dependencies_file=e.aml_env_train_conda_dep_file,
        create_new=e.rebuild_env,
    )  #
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name  # NOQA: E501

    model_name_param = PipelineParameter(
        name="model_name", default_value=e.model_name)  # NOQA: E501
    dataset_version_param = PipelineParameter(name="dataset_version",
                                              default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(name="data_file_path",
                                             default_value="none")
    caller_run_id_param = PipelineParameter(name="caller_run_id",
                                            default_value="none")  # NOQA: E501

    # Get dataset name
    dataset_name = e.dataset_name

    # Check to see if dataset exists
    if dataset_name not in aml_workspace.datasets:
        # This call creates an example CSV from sklearn sample data. If you
        # have already bootstrapped your project, you can comment this line
        # out and use your own CSV.
        create_sample_data_csv()

        # Use a CSV to read in the data set.
        file_name = "safedriver.csv"

        if not os.path.exists(file_name):
            raise Exception(
                'Could not find CSV dataset at "%s". If you have bootstrapped your project, you will need to provide a CSV.'  # NOQA: E501
                % file_name)  # NOQA: E501

        # Upload file to default datastore in workspace
        datatstore = Datastore.get(aml_workspace, datastore_name)
        target_path = "training-data/"
        datatstore.upload_files(
            files=[file_name],
            target_path=target_path,
            overwrite=True,
            show_progress=False,
        )

        # Register dataset
        path_on_datastore = os.path.join(target_path, file_name)
        dataset = Dataset.Tabular.from_delimited_files(
            path=(datatstore, path_on_datastore))
        dataset = dataset.register(
            workspace=aml_workspace,
            name=dataset_name,
            description="safedriver training data",
            tags={"format": "CSV"},
            create_new_version=True,
        )

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        "pipeline_data", datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_output",
            pipeline_data,
            "--dataset_version",
            dataset_version_param,
            "--data_file_path",
            data_file_path_param,
            "--caller_run_id",
            caller_run_id_param,
            "--dataset_name",
            dataset_name,
        ],
        runconfig=run_config,
        allow_reuse=True,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name",
            model_name_param,
            "--allow_run_cancel",
            e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_input",
            pipeline_data,
        ],  # NOQA: E501
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")
    # Check run_evaluation flag to include or exclude evaluation step.
    if (e.run_evaluation).lower() == "true":
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
Пример #18
0
        "pillow==6.0.0",
        "tensorflow-gpu==1.15",
        "keras",
        "matplotlib",
        "seaborn",
    ])

env = Environment("prednet")
env.python.conda_dependencies = conda_dependencies
env.docker.enabled = True
env.docker.base_image = DEFAULT_GPU_IMAGE
env.register(ws)

# Runconfigs
runconfig = RunConfiguration()
runconfig.environment = env
print("PipelineData object created")

create_pipelines = PythonScriptStep(
    name="create pipelines",
    script_name="pipelines_slave.py",
    compute_target=cpu_compute_target,
    arguments=[
        "--cpu_compute_name", cpu_compute_name, "--gpu_compute_name",
        gpu_compute_name
    ],
    source_directory=script_folder,
    runconfig=runconfig,
    allow_reuse=False,
)
print("pipeline building step created")
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable run configuration environment
    # Read definition from diabetes_regression/azureml_environment.json
    environment = Environment.load_from_directory(e.sources_directory_train)
    if (e.collection_uri is not None and e.teamproject_name is not None):
        builduri_base = e.collection_uri + e.teamproject_name
        builduri_base = builduri_base + "/_build/results?buildId="
        environment.environment_variables["BUILDURI_BASE"] = builduri_base
    environment.register(aml_workspace)

    run_config = RunConfiguration()
    run_config.environment = environment

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)
    build_id_param = PipelineParameter(name="build_id",
                                       default_value=e.build_id)

    # Get dataset name
    dataset_name = e.dataset_name

    # Check to see if dataset exists
    if (dataset_name not in aml_workspace.datasets):
        # Create dataset from diabetes sample data
        sample_data = load_diabetes()
        df = pd.DataFrame(data=sample_data.data,
                          columns=sample_data.feature_names)
        df['Y'] = sample_data.target
        file_name = 'diabetes.csv'
        df.to_csv(file_name, index=False)

        # Upload file to default datastore in workspace
        default_ds = aml_workspace.get_default_datastore()
        target_path = 'training-data/'
        default_ds.upload_files(files=[file_name],
                                target_path=target_path,
                                overwrite=True,
                                show_progress=False)

        # Register dataset
        path_on_datastore = os.path.join(target_path, file_name)
        dataset = Dataset.Tabular.from_delimited_files(
            path=(default_ds, path_on_datastore))
        dataset = dataset.register(workspace=aml_workspace,
                                   name=dataset_name,
                                   description='diabetes training data',
                                   tags={'format': 'CSV'},
                                   create_new_version=True)

    # Get the dataset
    dataset = Dataset.get_by_name(aml_workspace, dataset_name)

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        'pipeline_data', datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[dataset.as_named_input('training_data')],
        outputs=[pipeline_data],
        arguments=[
            "--build_id", build_id_param, "--model_name", model_name_param,
            "--step_output", pipeline_data
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id",
            build_id_param,
            "--model_name",
            model_name_param,
            "--allow_run_cancel",
            e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--build_id",
            build_id_param,
            "--model_name",
            model_name_param,
            "--step_input",
            pipeline_data,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")
    # Check run_evaluation flag to include or exclude evaluation step.
    if ((e.run_evaluation).lower() == 'true'):
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Пример #20
0
myenv.docker.base_image = "mcr.microsoft.com/azureml/base-gpu:openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04"
# comment out this environment variable if you don't have it set!
myenv.environment_variables = {'WANDB_API_KEY': os.environ['WANDB_API_KEY']}

## Environment: python section
conda_dep = CondaDependencies(conda_dependencies_file_path='environment.yml')
myenv.python.conda_dependencies = conda_dep

# create configuration for Run
# Use RunConfiguration to specify compute target / env deps part of run
run_config = RunConfiguration()

# Attach compute target to run config
run_config.framework = 'python'
run_config.target = "gpu-cluster"
# This doesn't actuallly do anything since my target is a persistent compute instead of amlcompute
run_config.amlcompute.vm_size = "Standard_NC24"
run_config.node_count = 1
run_config.environment = myenv

# ScriptRunConfig packaages together environment configuration of
# RunConfiguration with a script for training to create a **script** run.
""" RunConfiguration(script=None, arguments=None, framework=None, communicator=None,
 conda_dependencies=None, _history_enabled=None, _path=None, _name=None)
"""
src = ScriptRunConfig(source_directory='.',
                      script='rl_credit/examples/distractor_delay_expt.py',
                      run_config=run_config)

get_run_config_from_script_run(src).save(path='.azureml/train.runconfig')
def create_pipeline(workspace):
    # Retreive compute cluster
    compute_target = workspace.compute_targets[args.compute_target]

    # Setup batch scoring environment from conda dependencies
    environment = Environment.from_conda_specification(
        name=args.environment_name, file_path=args.environment_specification
    )

    # Add environment variables
    environment.environment_variables = {
        "APPLICATIONINSIGHTS_CONNECTION_STRING": args.ai_connection_string
    }

    # Enable docker run
    environment.docker.enabled = True

    # Create run config
    run_config = RunConfiguration()
    run_config.environment = environment

    # Retreive input and output datastores
    input_datastore = Datastore(workspace, args.input_datastore_name)
    output_datastore = Datastore(workspace, args.output_datastore_name)

    # Define build id parameter
    build_id_param = PipelineParameter("build_id", default_value=args.build_id)

    # Define input datapath parameter
    input_datapath = DataPath(datastore=input_datastore, path_on_datastore="")
    input_datapath_param = (
        PipelineParameter(name="input_datapath", default_value=input_datapath),
        DataPathComputeBinding(mode="mount"),
    )

    # Define output datapath parameter
    output_datapath = DataPath(datastore=output_datastore, path_on_datastore="")
    output_datapath_param = (
        PipelineParameter(name="output_datapath", default_value=output_datapath),
        DataPathComputeBinding(mode="mount"),
    )

    # Define score step for pipeline
    score_step = PythonScriptStep(
        name="score_data",
        compute_target=compute_target,
        source_directory="src/score",
        script_name="score.py",
        inputs=[input_datapath_param, output_datapath_param],
        runconfig=run_config,
        allow_reuse=False,
        arguments=[
            "--build_id",
            build_id_param,
            "--input_datapath",
            input_datapath_param,
            "--output_datapath",
            output_datapath_param,
        ],
    )

    # Define pipeline for batch scoring
    pipeline = Pipeline(workspace=workspace, steps=[score_step])

    return pipeline
def main():
    e = Env()
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group
    )
    print("get_workspace:")
    print(aml_workspace)

    aml_compute = get_compute(
        aml_workspace,
        e.compute_name,
        e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    environment = get_environment(
        aml_workspace, e.aml_env_name, create_new=e.rebuild_env)
    run_config = RunConfiguration()
    run_config.environment = environment

    if (e.datastore_name):
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name

    run_config.environment.environment_variables["DATASTORE_NAME"] \
        = datastore_name

    dataset_name = e.dataset_name
    file_name = e.file_name
    datastore = Datastore.get(aml_workspace, datastore_name)

    if (dataset_name not in aml_workspace.datasets):
        raise Exception("Could not find dataset at \"%s\"." % dataset_name)
    else:
        dataset = Dataset.get_by_name(aml_workspace, name=dataset_name)
        dataset.download(target_path='.', overwrite=True)
        datastore.upload_files([file_name],
                               target_path=dataset_name,
                               overwrite=True)

    raw_data_file = DataReference(datastore=datastore,
                                  data_reference_name="Raw_Data_File",
                                  path_on_datastore=dataset_name + '/'
                                  + file_name)

    clean_data_file = PipelineParameter(name="clean_data_file",
                                        default_value="/clean_data.csv")
    clean_data_folder = PipelineData("clean_data_folder",
                                     datastore=datastore)

    prepDataStep = PythonScriptStep(name="Prepare Data",
                                    source_directory=e.sources_directory_train,
                                    script_name=e.data_prep_script_path,
                                    arguments=["--raw_data_file",
                                               raw_data_file,
                                               "--clean_data_folder",
                                               clean_data_folder,
                                               "--clean_data_file",
                                               clean_data_file],
                                    inputs=[raw_data_file],
                                    outputs=[clean_data_folder],
                                    compute_target=aml_compute,
                                    allow_reuse=False)

    print("Step Prepare Data created")

    new_model_file = PipelineParameter(name="new_model_file ",
                                       default_value='/' + e.model_name
                                       + '.pkl')
    new_model_folder = PipelineData("new_model_folder", datastore=datastore)
    est = SKLearn(source_directory=e.sources_directory_train,
                  entry_script=e.train_script_path,
                  pip_packages=['azureml-sdk', 'scikit-learn==0.20.3',
                                'azureml-dataprep[pandas,fuse]>=1.1.14'],
                  compute_target=aml_compute)

    trainingStep = EstimatorStep(
        name="Model Training",
        estimator=est,
        estimator_entry_script_arguments=["--clean_data_folder",
                                          clean_data_folder,
                                          "--new_model_folder",
                                          new_model_folder,
                                          "--clean_data_file",
                                          clean_data_file.default_value,
                                          "--new_model_file",
                                          new_model_file.default_value],
        runconfig_pipeline_params=None,
        inputs=[clean_data_folder],
        outputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Train created")

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)

    evaluateStep = PythonScriptStep(
        name="Evaluate Model",
        source_directory=e.sources_directory_train,
        script_name=e.evaluate_script_path,
        arguments=["--model_name", model_name_param],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Evaluate created")

    registerStep = PythonScriptStep(
        name="Register Model",
        source_directory=e.sources_directory_train,
        script_name=e.register_script_path,
        arguments=["--new_model_folder", new_model_folder,
                   "--new_model_file", new_model_file,
                   "--model_name", model_name_param],
        inputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Register created")

    if ((e.run_evaluation).lower() == 'true'):
        print("Include evaluation step before register step.")
        trainingStep.run_after(prepDataStep)
        evaluateStep.run_after(trainingStep)
        registerStep.run_after(evaluateStep)
    else:
        print("Exclude evaluation step and directly run register step.")
        trainingStep.run_after(prepDataStep)
        registerStep.run_after(trainingStep)

    pipeline = Pipeline(workspace=aml_workspace, steps=[registerStep])
    pipeline.validate()
    print("Pipeline is built")

    pipeline._set_experiment_name
    published_pipeline = pipeline.publish(
        name=e.pipeline_name,
        description="Predict Employee Retention Model training pipeline",
        version=e.build_id
    )
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Пример #23
0
def main():
    """Build pipeline."""
    # Environment variables
    env = Env()

    # Azure ML workspace
    aml_workspace = Workspace.get(
        name=env.workspace_name,
        subscription_id=env.subscription_id,
        resource_group=env.resource_group,
    )
    logger.info(f"Azure ML workspace: {aml_workspace}")

    # Azure ML compute cluster
    aml_compute = get_compute(aml_workspace, env.compute_name)
    logger.info(f"Aazure ML compute cluster: {aml_compute}")

    # Azure ML environment
    environment = Environment(name=env.aml_env_name)
    conda_dep = CondaDependencies(
        conda_dependencies_file_path="./local_development/dev_dependencies.yml"
    )
    environment.python.conda_dependencies = conda_dep

    run_config = RunConfiguration()
    run_config.environment = environment

    # Pipeline Data
    preparation_pipelinedata = PipelineData("preparation_pipelinedata",
                                            is_directory=True).as_dataset()
    extraction_pipelinedata = PipelineData("extraction_pipelinedata",
                                           is_directory=True)
    training_pipelinedata = PipelineData("training_pipelinedata",
                                         is_directory=True)

    # List of pipeline steps
    step_list = list()
    preparation_step = PythonScriptStep(
        name="preparation-step",
        compute_target=aml_compute,
        source_directory=env.sources_directory_train,
        script_name=env.preparation_step_script_path,
        outputs=[preparation_pipelinedata],
        arguments=[
            "--input_path", env.input_dir, "--output_path",
            preparation_pipelinedata, "--datastore_name",
            env.blob_datastore_name
        ],
        runconfig=run_config)

    step_list.append(preparation_step)

    parallel_run_config = ParallelRunConfig(
        source_directory=env.sources_directory_train,
        entry_script=env.extraction_step_script_path,
        mini_batch_size=env.mini_batch_size,
        error_threshold=env.error_threshold,
        output_action="append_row",
        environment=environment,
        compute_target=aml_compute,
        node_count=env.node_count,
        run_invocation_timeout=env.run_invocation_timeout,
        process_count_per_node=env.process_count_per_node,
        append_row_file_name="extraction_output.txt")

    extraction_step = ParallelRunStep(
        name="extraction-step",
        inputs=[preparation_pipelinedata],
        output=extraction_pipelinedata,
        arguments=["--output_dir", extraction_pipelinedata],
        parallel_run_config=parallel_run_config)
    step_list.append(extraction_step)

    training_step = PythonScriptStep(
        name="traning-step",
        compute_target=aml_compute,
        source_directory=env.sources_directory_train,
        script_name=env.training_step_script_path,
        inputs=[extraction_pipelinedata],
        outputs=[training_pipelinedata],
        arguments=[
            "--input_dir", extraction_pipelinedata, "--output_dir",
            training_pipelinedata
        ],
        runconfig=run_config)

    step_list.append(training_step)

    # Build pipeline
    pipeline = Pipeline(workspace=aml_workspace, steps=step_list)
    pipeline.validate()
    logger.info(f"Built pipeline {pipeline}")

    # Publish pipeline
    published_pipeline = pipeline.publish(
        env.pipeline_name,
        description=env.pipeline_name,
        version=datetime.utcnow().isoformat())
    try:
        pipeline_endpoint = PipelineEndpoint.get(
            workspace=aml_workspace, name=env.pipeline_endpoint_name)
        pipeline_endpoint.add_default(published_pipeline)
    except ErrorResponseException:
        pipeline_endpoint = PipelineEndpoint.publish(
            workspace=aml_workspace,
            name=env.pipeline_endpoint_name,
            pipeline=published_pipeline,
            description=env.pipeline_endpoint_name)
Пример #24
0
def main():
    e = Env()
    print(e.__dict__)
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(aml_workspace,
                                  e.aml_env_name,
                                  create_new=e.rebuild_env)  #
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name  # NOQA: E501

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)
    dataset_version_param = PipelineParameter(name="dataset_version",
                                              default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(name="data_file_path",
                                             default_value="none")
    caller_run_id_param = PipelineParameter(name="caller_run_id",
                                            default_value="none")

    # Get dataset name
    dataset_name = e.dataset_name

    # Check to see if dataset exists
    if dataset_name not in aml_workspace.datasets:
        raise ValueError(
            f"can't find dataset {dataset_name} in datastore {datastore_name}")

    # Create PipelineData to pass data between steps
    model_data = PipelineData("model_data",
                              datastore=aml_workspace.get_default_datastore())
    train_ds = (PipelineData("train_ds",
                             datastore=aml_workspace.get_default_datastore()).
                as_dataset().parse_delimited_files().register(
                    name="train", create_new_version=True))
    test_ds = (PipelineData(
        "test_ds", datastore=aml_workspace.get_default_datastore()).as_dataset(
        ).parse_delimited_files().register(name="test",
                                           create_new_version=True))

    prepare_step = PythonScriptStep(
        name="Prepare Data",
        script_name=e.prepare_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[train_ds, test_ds],
        arguments=[
            "--dataset_version", dataset_version_param, "--data_file_path",
            data_file_path_param, "--dataset_name", dataset_name,
            "--caller_run_id", caller_run_id_param, "--train_ds", train_ds,
            "--test_ds", test_ds
        ],
        runconfig=run_config,
        allow_reuse=True,
    )
    print("Step Prepare created")

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[
            train_ds.as_named_input("training_data"),
            test_ds.as_named_input("testing_data")
        ],
        outputs=[model_data],
        arguments=[
            "--model_name", model_name_param, "--model_data", model_data
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name",
            model_name_param,
            "--allow_run_cancel",
            e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[model_data],
        arguments=[
            "--model_name", model_name_param, "--step_input", model_data
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")
    # Check run_evaluation flag to include or exclude evaluation step.
    if (e.run_evaluation).lower() == "true":
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [prepare_step, train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [prepare_step, train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
Пример #25
0
            'azureml-sdk', 'PyYAML', 'azure-storage-blob', 'matplotlib',
            'seaborn', 'tensorflow', 'Keras', 'tensorflow-hub', 'joblib',
            'tqdm', 'Pillow', 'azureml-dataprep[pandas,fuse]>=1.1.14'
        ])

    diagnoz_env = Environment("diagnoz-pipeline-env")
    diagnoz_env.python.user_managed_dependencies = False  # Let Azure ML manage dependencies
    diagnoz_env.docker.enabled = True  # Use a docker container
    diagnoz_env.docker.base_image = DEFAULT_GPU_IMAGE
    diagnoz_env.python.conda_dependencies = packages
    diagnoz_env.register(workspace=ws)

    # Runconfigs
    pipeline_run_config = RunConfiguration()
    pipeline_run_config.target = compute_target
    pipeline_run_config.environment = diagnoz_env
    print("Run configuration created.")

    shutil.rmtree(script_folder, ignore_errors=True)
    os.makedirs(script_folder, exist_ok=True)

    #copy all necessary scripts
    files = FilesProviders.get_path_files(
        "../", [os.path.basename(__file__), "__init__.py"])

    for f in files:
        shutil.copy(f, script_folder)
    #add generated config file to script folder
    shutil.copy(generated_config_file, script_folder)

    input_data = input_dataset.as_named_input('input_dataset').as_mount()
Пример #26
0
    compute_target.wait_for_completion(show_output=True,
                                       min_node_count=None,
                                       timeout_in_minutes=20)

    # For a more detailed view of current cluster status, use the 'status' property
    print(compute_target.status.serialize())

aml_run_config = RunConfiguration()
# `compute_target` as defined in "Azure Machine Learning compute" section above
aml_run_config.target = compute_target

USE_CURATED_ENV = True
if USE_CURATED_ENV:
    curated_environment = Environment.get(workspace=ws,
                                          name="AzureML-Tutorial")
    aml_run_config.environment = curated_environment
else:
    aml_run_config.environment.python.user_managed_dependencies = False

    # Add some packages relied on by data prep step
    aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
        conda_packages=['pandas', 'scikit-learn', 'seaborn', 'tqdm'],
        pip_packages=[
            'azureml-sdk', 'azureml-dataprep[fuse,pandas]', 'seaborn', 'tqdm'
        ],
        pin_sdk_version=False)

web_path = 'https://dprepdata.blob.core.windows.net/demo/Titanic.csv'
my_dataset = Dataset.Tabular.from_delimited_files(
    path=web_path, set_column_types={'Survived': DataType.to_bool()})
Пример #27
0
def main():
    e = ENV()
    aml_workspace = Workspace.get(
                                  name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group
                                  )
    print(f'workspace:{aml_workspace}')
    
    aml_compute = get_compute(workspace=aml_workspace,
                              compute_name=e.compute_name,
                              vm_size=e.vm_size)
    if aml_compute is not None:
        print(f'compute target: {aml_compute} is created')
    
    environment = get_environment(workspace=aml_workspace,
                                  env_name=e.aml_env_name,
                                  conda_dependencies=e.aml_conda_train_dependent_files,
                                  create_new=e.rebuild_env)
    run_config = RunConfiguration()
    run_config.environment = environment
    
    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables['DATASTORE_NAME'] = datastore_name
    
    model_name_param = PipelineParameter(name='model_name',default_value=e.model_name)
    dataset_version_param = PipelineParameter(name='dataset_version',default_value=e.dataset_version)
    dataset_file_path = PipelineParameter(name='dataset_file_path',default_value='none')
    
    dataset_name = e.dataset_name
    if dataset_name not in aml_workspace.datasets:
        create_sample_data_csv()
        file_name = 'diabetes.csv'
        if not os.path.exists(file_name):
            raise Exception(f'{file_name} does not exist!')
        datastore = Datastore.get(aml_workspace,datastore_name)
        target_path = 'training/'
        datastore.upload_files(files=[file_name],
                               target_path=target_path,
                               overwrite=True,
                               show_progress=True)
        path_on_datastore = os.path.join(target_path,file_name)
        dataset = Dataset.Tabular.from_delimited_files(path=(datastore,path_on_datastore))
        dataset.register(workspace=aml_workspace,
                         name=dataset_name,
                         description='registered dataset',
                         create_new_version=True,
                         tags={'format':'CSV'})
    
    pipeline_data = PipelineData('train',datastore=aml_workspace.get_default_datastore())
    train_step = PythonScriptStep(script_name=e.train_script_path,
                                  name='train_step',
                                  arguments=['--model-name',model_name_param,
                                             '--dataset-name',dataset_name,
                                             '--dataset-version',dataset_version_param,
                                             '--dataset-file-path',dataset_file_path,
                                             '--step-output',pipeline_data],
                                  compute_target=aml_compute,
                                  runconfig=run_config,
                                  source_directory=e.source_train_directory,
                                  outputs=[pipeline_data],
                                  allow_reuse=False)
    print('Train step created!')    
    
    eval_step = PythonScriptStep(name='eval_step',
                                 script_name=e.eval_script_path,
                                 compute_target=aml_compute,
                                 source_directory=e.source_train_directory,
                                 arguments=['--model-name',model_name_param,
                                            '--allow-run-cancel',e.allow_run_cancel],
                                 runconfig=run_config,
                                 allow_reuse=False)
    print('EVAL step created!')
    
    register_step = PythonScriptStep(name='register_step',
                                     script_name=e.register_script_path,
                                     compute_target=aml_compute,
                                     source_directory=e.source_train_directory,
                                     inputs=[pipeline_data],
                                     arguments=['--model-name',model_name_param,
                                                '--step-input',pipeline_data],
                                     runconfig=run_config,
                                     allow_reuse=False)
    print('Register step created!')
    
    if e.run_evaluation:
        print('evaluation step is included')
        eval_step.run_after(train_step)
        register_step.run_after(eval_step)
        steps = [train_step,eval_step,register_step]
    else:
        print('evaluation step is excluded')
        register_step.run_after(train_step)
        steps = [train_step,register_step]
        
    train_pipeline = Pipeline(workspace=aml_workspace,
                              steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(name=e.pipeline_name,
                                                description='model training pipeline',
                                                version=e.build_id)
    print(f'{published_pipeline.name} is published1')
Пример #28
0
def build_prednet_pipeline(dataset, ws):
    print("building pipeline for dataset %s in workspace %s" %
          (dataset, ws.name))

    base_dir = "."

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = "./scripts"
    os.makedirs(script_folder)

    shutil.copytree(os.path.join(base_dir, "models"),
                    os.path.join(base_dir, script_folder, "models"))
    shutil.copy(os.path.join(base_dir, "train.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "data_preparation.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "register_prednet.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "batch_scoring.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "train_clf.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "register_clf.py"), script_folder)

    cpu_compute_name = args.cpu_compute_name
    cpu_compute_target = AmlCompute(ws, cpu_compute_name)
    print("found existing compute target: %s" % cpu_compute_name)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = args.gpu_compute_name

    gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
    print(gpu_compute_target.get_status().serialize())

    env = Environment.get(ws, "prednet")

    # Runconfigs
    runconfig = RunConfiguration()
    runconfig.environment = env
    print("PipelineData object created")

    # DataReference to where raw data is stored.
    raw_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="raw_data",
        path_on_datastore=os.path.join("prednet", "data", "raw_data"),
    )
    print("DataReference object created")

    # Naming the intermediate data as processed_data and assigning it to the
    # variable processed_data.
    preprocessed_data = PipelineData("preprocessed_data",
                                     datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    hd_child_cwd = PipelineData("prednet_model_path", datastore=def_blob_store)
    # prednet_path = PipelineData("outputs", datastore=def_blob_store)
    scored_data = PipelineData("scored_data", datastore=def_blob_store)
    model_path = PipelineData("model_path", datastore=def_blob_store)

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name="prepare_data",
        script_name="data_preparation.py",
        arguments=[
            "--raw_data",
            raw_data,
            "--preprocessed_data",
            preprocessed_data,
            "--dataset",
            dataset,
        ],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    # data_prep.run_after(video_decoding)

    print("data_prep step created")

    est = Estimator(
        source_directory=script_folder,
        compute_target=gpu_compute_target,
        entry_script="train.py",
        node_count=1,
        environment_definition=env,
    )

    ps = BayesianParameterSampling({
        "--batch_size":
        choice(1, 2, 4, 10),
        "--filter_sizes":
        choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        "--stack_sizes":
        choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),
        "--learning_rate":
        uniform(1e-6, 1e-3),
        "--lr_decay":
        uniform(1e-9, 1e-2),
        "--freeze_layers":
        choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
        # "--fine_tuning": choice("True", "False"),
    })

    hdc = HyperDriveConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        primary_metric_name="val_loss",
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=3,
        max_concurrent_runs=3,
        max_duration_minutes=60 * 6,
    )

    train_prednet = HyperDriveStep(
        "train_w_hyperdrive",
        hdc,
        estimator_entry_script_arguments=[
            "--preprocessed_data",
            preprocessed_data,
            "--remote_execution",
            "--dataset",
            dataset,
        ],
        inputs=[preprocessed_data],
        outputs=[hd_child_cwd],
        metrics_output=data_metrics,
        allow_reuse=True,
    )
    train_prednet.run_after(data_prep)

    register_prednet = PythonScriptStep(
        name="register_prednet",
        script_name="register_prednet.py",
        arguments=[
            "--data_metrics",
            data_metrics,
        ],
        compute_target=cpu_compute_target,
        inputs=[data_metrics, hd_child_cwd],
        source_directory=script_folder,
        allow_reuse=True,
    )
    register_prednet.run_after(train_prednet)

    batch_scoring = PythonScriptStep(
        name="batch_scoring",
        script_name="batch_scoring.py",
        arguments=[
            "--preprocessed_data",
            preprocessed_data,
            "--scored_data",
            scored_data,
            "--dataset",
            dataset,
            # "--prednet_path",
            # prednet_path
        ],
        compute_target=gpu_compute_target,
        inputs=[preprocessed_data],
        outputs=[scored_data],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    batch_scoring.run_after(register_prednet)

    train_clf = PythonScriptStep(
        name="train_clf",
        script_name="train_clf.py",
        arguments=[
            "--preprocessed_data", preprocessed_data, "--scored_data",
            scored_data, "--model_path", model_path
        ],
        compute_target=cpu_compute_target,
        inputs=[preprocessed_data, scored_data],
        outputs=[model_path],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    train_clf.run_after(batch_scoring)

    register_clf = PythonScriptStep(
        name="register_clf",
        script_name="register_clf.py",
        arguments=["--model_path", model_path],
        inputs=[model_path],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        allow_reuse=True,
        runconfig=runconfig,
    )
    register_clf.run_after(train_clf)

    pipeline = Pipeline(
        workspace=ws,
        steps=[
            data_prep,
            train_prednet,
            register_prednet,
            batch_scoring,
            train_clf,
            register_clf,
        ],
    )
    pipeline.validate()

    pipeline_name = "prednet_" + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    _ = Schedule.create(
        workspace=ws,
        name=pipeline_name + "_sch",
        pipeline_id=published_pipeline.id,
        experiment_name=pipeline_name,
        datastore=def_blob_store,
        wait_for_provisioning=True,
        description="Datastore scheduler for Pipeline" + pipeline_name,
        path_on_datastore=os.path.join("prednet/data/raw_data", dataset,
                                       "Train"),
        polling_interval=60 * 24,
    )

    published_pipeline.submit(ws, pipeline_name)
Пример #29
0
def main():
    train_file = r"EdwardFry_Microsoft_issueDataset.csv"
    ws = Workspace.from_config()

    # Default datastore
    def_data_store = ws.get_default_datastore()  # Loads config.json

    # Get the blob storage associated with the workspace
    def_blob_store = Datastore(ws, "workspaceblobstore")

    # Get file storage associated with the workspace
    def_file_store = Datastore(ws, "workspacefilestore")

    # Set data input and output
    xyz_phishing_dataset = Dataset.File.from_files([(def_blob_store,
                                                     train_file)])
    output_data1 = OutputFileDatasetConfig(
        destination=(datastore, 'outputdataset/{run-id}'))
    output_data_dataset = output_data1.register_on_complete(
        name='prepared_output_data')

    # Set compute
    compute_name = "aml-compute"
    vm_size = "STANDARD_NC6"
    if compute_name in ws.compute_targets:
        compute_target = ws.compute_targets[compute_name]
        if compute_target and type(compute_target) is AmlCompute:
            print('Found compute target: ' + compute_name)
    else:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size=vm_size,  # STANDARD_NC6 is GPU-enabled
            min_nodes=0,
            max_nodes=4)
        # create the compute target
        compute_target = ComputeTarget.create(ws, compute_name,
                                              provisioning_config)

        # Can poll for a minimum number of nodes and for a specific timeout.
        # If no min node count is provided it will use the scale settings for the cluster
        compute_target.wait_for_completion(show_output=True,
                                           min_node_count=None,
                                           timeout_in_minutes=20)

        # For a more detailed view of current cluster status, use the 'status' property
        print(compute_target.status.serialize())

    aml_run_config = RunConfiguration()
    # `compute_target` as defined in "Azure Machine Learning compute" section above
    aml_run_config.target = compute_target

    USE_CURATED_ENV = True
    if USE_CURATED_ENV:
        curated_environment = Environment.get(workspace=ws,
                                              name="AzureML-Tutorial")
        aml_run_config.environment = curated_environment
    else:
        aml_run_config.environment.python.user_managed_dependencies = False

        # Add some packages relied on by data prep step
        aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
            conda_packages=['pandas', 'scikit-learn'],
            pip_packages=['azureml-sdk', 'azureml-dataprep[fuse,pandas]'],
            pin_sdk_version=False)

    dataprep_source_dir = "./dataprep_src"
    entry_point = "prepare.py"
    # `my_dataset` as defined above
    ds_input = xyz_phishing_dataset.as_named_input('input1')

    # `output_data1`, `compute_target`, `aml_run_config` as defined above
    data_prep_step = PythonScriptStep(script_name=entry_point,
                                      source_directory=dataprep_source_dir,
                                      arguments=[
                                          "--input",
                                          ds_input.as_download(), "--output",
                                          output_data1
                                      ],
                                      compute_target=compute_target,
                                      runconfig=aml_run_config,
                                      allow_reuse=True)

    train_source_dir = "./train_src"
    train_entry_point = "train.py"

    training_results = OutputFileDatasetConfig(name="training_results",
                                               destination=def_blob_store)

    train_step = PythonScriptStep(script_name=train_entry_point,
                                  source_directory=train_source_dir,
                                  arguments=[
                                      "--prepped_data",
                                      output_data1.as_input(),
                                      "--training_results", training_results
                                  ],
                                  compute_target=compute_target,
                                  runconfig=aml_run_config,
                                  allow_reuse=True)

    # list of steps to run (`compare_step` definition not shown)
    compare_models = [data_prep_step, train_step, compare_step]

    # Build the pipeline
    pipeline1 = Pipeline(workspace=ws, steps=[compare_models])

    #dataset_consuming_step = PythonScriptStep(
    #    script_name="iris_train.py",
    #    inputs=[iris_tabular_dataset.as_named_input("iris_data")],
    #    compute_target=compute_target,
    #    source_directory=project_folder
    #)

    #run_context = Run.get_context()
    #iris_dataset = run_context.input_datasets['iris_data']
    #dataframe = iris_dataset.to_pandas_dataframe()

    ## Within a PythonScriptStep

    #ws = Run.get_context().experiment.workspace

    #step = PythonScriptStep(name="Hello World",
    #                        script_name="hello_world.py",
    #                        compute_target=aml_compute,
    #                        source_directory=source_directory,
    #                        allow_reuse=False,
    #                        hash_paths=['hello_world.ipynb'])

    # Submit the pipeline to be run
    pipeline_run1 = Experiment(ws, 'Compare_Models_Exp').submit(pipeline1)
    pipeline_run1.wait_for_completion()
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print(f"get_workspace:{aml_workspace}")

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print(f"aml_compute:{aml_compute}")

    # Create a reusable Azure ML environment
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        conda_dependencies_file=e.aml_env_train_conda_dep_file,
        create_new=e.rebuild_env,
    )  #
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name  # NOQA: E501

    # datastore and dataset names are fixed for this pipeline, however
    # data_file_path can be specified for registering new versions of dataset
    # Note that AML pipeline parameters don't take empty string as default, "" won't work  # NOQA: E501
    model_name_param = PipelineParameter(
        name="model_name", default_value=e.model_name)  # NOQA: E501
    data_file_path_param = PipelineParameter(
        name="data_file_path", default_value="nopath")  # NOQA: E501
    ml_params = PipelineParameter(name="ml_params",
                                  default_value="default")  # NOQA: E501

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        "pipeline_data", datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name="train/train_aml.py",
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_output",
            pipeline_data,
            "--data_file_path",
            data_file_path_param,
            "--dataset_name",
            e.processed_dataset_name,
            "--datastore_name",
            datastore_name,
            "--ml_params",
            ml_params,
        ],
        runconfig=run_config,
        allow_reuse=True,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name="evaluate/evaluate_model.py",
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name",
            model_name_param,
            "--ml_params",
            ml_params,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name="register/register_model.py",
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_input",
            pipeline_data,
            "--ml_params",
            ml_params,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")

    evaluate_step.run_after(train_step)
    register_step.run_after(evaluate_step)
    steps = [train_step, evaluate_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline._set_experiment_name
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.training_pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")