Exemplo n.º 1
0
# Folder prefix (relative to source_directory) under which each pipeline
# step keeps its entry script.
pipeline_path = "Azure_ML/03_pipeline/"

# - Crawl Data
# Directory-typed intermediate output that the crawl step writes into.
crawl_data_dir = PipelineData(
    "extracted_data",
    is_directory=True,
)
# Step that runs 01_crawl_data/main.py. The output directory and the SQL
# credentials are handed to the script as command-line arguments.
# NOTE(review): the SQL password is passed as a plain script argument;
# step arguments may be recorded with the run -- confirm this is acceptable
# or switch to a secret store.
crawl_data_step = PythonScriptStep(
    name="Crawl Data",
    script_name=pipeline_path + "01_crawl_data/main.py",
    source_directory='.',
    compute_target=compute_target,
    runconfig=run_config,
    outputs=[crawl_data_dir],
    arguments=[
        "--output-dir",
        crawl_data_dir,
        "--sql-name-in",
        sql_name,
        "--sql-pw-in",
        sql_pw,
    ],
    # Never reuse a previous run's output for this step.
    allow_reuse=False,
)

# - Clean Crawled Data
# NOTE(review): this output reuses the name "extracted_data" of the crawl
# output above -- looks like a copy/paste leftover; confirm whether a
# distinct name was intended.
clean_data_dir = PipelineData(
    "extracted_data",
    is_directory=True,
)
clean_data_step = PythonScriptStep(
Exemplo n.º 2
0
    # Consume the input dataset as a mount, exposed to the step under the
    # name 'input_dataset'.
    input_data = input_dataset.as_named_input('input_dataset').as_mount()

    # Intermediate output of the preparation step, placed on the workspace's
    # default datastore.
    data_store = ws.get_default_datastore()
    prepped_data = PipelineData('prepped_data', datastore=data_store)

    # Pipeline parameter forwarded to the script as --mode (default
    # "execute").
    pipeline_mode_param = PipelineParameter(name="mode",
                                            default_value="execute")

    # Single step: run prep_data.py from script_folder on compute_target.
    prep_step = PythonScriptStep(
        name='Prepare data',
        source_directory=script_folder,
        script_name='prep_data.py',
        compute_target=compute_target,
        runconfig=pipeline_run_config,
        # Specify dataset as initial input
        inputs=[input_data],
        # Specify PipelineData as output
        outputs=[prepped_data],
        # Also pass as data reference to script
        arguments=[
            '--input_data', input_data, '--prepped_data', prepped_data,
            '--mode', pipeline_mode_param
        ],
        # Never reuse a previous run's output for this step.
        allow_reuse=False)

    # Construct the pipeline
    pipeline_steps = [prep_step]
    #pipeline_steps = [step_test]
    pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
    print("Pipeline is built.")

    # Create an experiment and run the pipeline
Exemplo n.º 3
0
    # create the cluster
    CPU_compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

    # can poll for a minimum number of nodes and for a specific timeout. 
    # if no min node count is provided it uses the scale settings for the cluster
    CPU_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=10)

# Print the detailed, serialized status of the CPU cluster.
cluster_status = CPU_compute_target.get_status()
print(cluster_status.serialize())

# ---------------------------------------------------------------------------
# Single-step pipeline: run the model registration script on the CPU cluster.
# ---------------------------------------------------------------------------
register_step = PythonScriptStep(
    name="register_step",
    source_directory='./scripts',
    script_name="register/estimator_register.py",
    compute_target=CPU_compute_target,
    runconfig=run_config_user_managed,
)

pipeline = Pipeline(workspace=ws, steps=[register_step])

# Validate the pipeline graph before submitting it.
pipeline.validate()
print("Pipeline validation complete")

# Submit the pipeline to the experiment; no pipeline parameters are overridden.
run = exp.submit(pipeline, pipeline_parameters={})
print("Pipeline is submitted for execution")
Exemplo n.º 4
0
def main():
    """Build, validate and publish the model training pipeline.

    The function performs these stages in order:

    1. Resolve the workspace, the compute target and the run environment
       from the settings exposed by ``Env``.
    2. If no dataset named ``e.dataset_name`` is registered in the
       workspace, create the sample CSV, upload it under ``training-data/``
       and register it as a tabular dataset.
    3. Create the train, evaluate and register steps. The evaluate step is
       only part of the pipeline when ``e.run_evaluation`` equals "true"
       (case-insensitive).
    4. Validate the pipeline and publish it as ``e.pipeline_name`` with
       version ``e.build_id``.

    Raises:
        Exception: if the sample CSV cannot be found on disk after
            ``create_sample_data_csv()`` has run.
    """
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        conda_dependencies_file=e.aml_env_train_conda_dep_file,
        create_new=e.rebuild_env,
    )
    run_config = RunConfiguration()
    run_config.environment = environment

    # Fall back to the workspace default datastore when none is configured.
    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    # Expose the datastore name to the step scripts.
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name

    # Parameters that can be overridden when the published pipeline is run.
    model_name_param = PipelineParameter(
        name="model_name", default_value=e.model_name)
    dataset_version_param = PipelineParameter(
        name="dataset_version", default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(
        name="data_file_path", default_value="none")
    caller_run_id_param = PipelineParameter(
        name="caller_run_id", default_value="none")

    # Get dataset name
    dataset_name = e.dataset_name

    # Check to see if dataset exists
    if dataset_name not in aml_workspace.datasets:
        # This call creates an example CSV from sklearn sample data. If you
        # have already bootstrapped your project, you can comment this line
        # out and use your own CSV.
        create_sample_data_csv()

        # Use a CSV to read in the data set.
        file_name = "diabetes.csv"

        if not os.path.exists(file_name):
            raise Exception(
                f'Could not find CSV dataset at "{file_name}". If you have bootstrapped your project, you will need to provide a CSV.'  # NOQA: E501
            )

        # Upload the file to the datastore selected above.
        datastore = Datastore.get(aml_workspace, datastore_name)
        target_path = "training-data/"
        datastore.upload_files(
            files=[file_name],
            target_path=target_path,
            overwrite=True,
            show_progress=False,
        )

        # Register dataset
        path_on_datastore = os.path.join(target_path, file_name)
        dataset = Dataset.Tabular.from_delimited_files(
            path=(datastore, path_on_datastore))
        dataset.register(
            workspace=aml_workspace,
            name=dataset_name,
            description="diabetes training data",
            tags={"format": "CSV"},
            create_new_version=True,
        )

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        "pipeline_data", datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_output",
            pipeline_data,
            "--dataset_version",
            dataset_version_param,
            "--data_file_path",
            data_file_path_param,
            "--caller_run_id",
            caller_run_id_param,
            "--dataset_name",
            dataset_name,
        ],
        runconfig=run_config,
        allow_reuse=True,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name",
            model_name_param,
            "--allow_run_cancel",
            e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_input",
            pipeline_data,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")

    # Check run_evaluation flag to include or exclude evaluation step.
    # str() keeps the check working when the flag is a bool or None instead
    # of a string.
    if str(e.run_evaluation).lower() == "true":
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    # The original code had a bare ``train_pipeline._set_experiment_name``
    # expression here: an attribute access without a call, which had no
    # effect. It has been removed.
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id,
    )
    print('This is a great demo!')
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
Exemplo n.º 5
0
def main():
    """Build the three-step pipeline and publish it behind an endpoint.

    The pipeline consists of a preparation step, a parallel extraction step
    that consumes the preparation output as a dataset, and a training step
    that consumes the extraction output. After validation the pipeline is
    published and made the default version of the pipeline endpoint named
    ``env.pipeline_endpoint_name``. If looking up that endpoint raises
    ``ErrorResponseException``, a new endpoint is published instead.
    """
    # Environment variables
    env = Env()

    # Azure ML workspace
    aml_workspace = Workspace.get(
        name=env.workspace_name,
        subscription_id=env.subscription_id,
        resource_group=env.resource_group,
    )
    logger.info(f"Azure ML workspace: {aml_workspace}")

    # Azure ML compute cluster
    aml_compute = get_compute(aml_workspace, env.compute_name)
    logger.info(f"Azure ML compute cluster: {aml_compute}")

    # Azure ML environment
    environment = Environment(name=env.aml_env_name)
    conda_dep = CondaDependencies(
        conda_dependencies_file_path="./local_development/dev_dependencies.yml"
    )
    environment.python.conda_dependencies = conda_dep

    run_config = RunConfiguration()
    run_config.environment = environment

    # Pipeline Data
    # The preparation output is exposed as a dataset because it is the input
    # of the ParallelRunStep below.
    preparation_pipelinedata = PipelineData("preparation_pipelinedata",
                                            is_directory=True).as_dataset()
    extraction_pipelinedata = PipelineData("extraction_pipelinedata",
                                           is_directory=True)
    training_pipelinedata = PipelineData("training_pipelinedata",
                                         is_directory=True)

    # List of pipeline steps
    step_list = []
    preparation_step = PythonScriptStep(
        name="preparation-step",
        compute_target=aml_compute,
        source_directory=env.sources_directory_train,
        script_name=env.preparation_step_script_path,
        outputs=[preparation_pipelinedata],
        arguments=[
            "--input_path", env.input_dir, "--output_path",
            preparation_pipelinedata, "--datastore_name",
            env.blob_datastore_name
        ],
        runconfig=run_config)

    step_list.append(preparation_step)

    parallel_run_config = ParallelRunConfig(
        source_directory=env.sources_directory_train,
        entry_script=env.extraction_step_script_path,
        mini_batch_size=env.mini_batch_size,
        error_threshold=env.error_threshold,
        output_action="append_row",
        environment=environment,
        compute_target=aml_compute,
        node_count=env.node_count,
        run_invocation_timeout=env.run_invocation_timeout,
        process_count_per_node=env.process_count_per_node,
        append_row_file_name="extraction_output.txt")

    extraction_step = ParallelRunStep(
        name="extraction-step",
        inputs=[preparation_pipelinedata],
        output=extraction_pipelinedata,
        arguments=["--output_dir", extraction_pipelinedata],
        parallel_run_config=parallel_run_config)
    step_list.append(extraction_step)

    # NOTE(review): the step name below is spelled "traning-step"; it is
    # kept unchanged because anything that looks the step up by name would
    # break on a rename -- confirm before fixing.
    training_step = PythonScriptStep(
        name="traning-step",
        compute_target=aml_compute,
        source_directory=env.sources_directory_train,
        script_name=env.training_step_script_path,
        inputs=[extraction_pipelinedata],
        outputs=[training_pipelinedata],
        arguments=[
            "--input_dir", extraction_pipelinedata, "--output_dir",
            training_pipelinedata
        ],
        runconfig=run_config)

    step_list.append(training_step)

    # Build pipeline
    pipeline = Pipeline(workspace=aml_workspace, steps=step_list)
    pipeline.validate()
    logger.info(f"Built pipeline {pipeline}")

    # Publish pipeline
    # The UTC build timestamp doubles as the published version string.
    published_pipeline = pipeline.publish(
        env.pipeline_name,
        description=env.pipeline_name,
        version=datetime.utcnow().isoformat())

    # Only the endpoint lookup is guarded: a failure of add_default() must
    # surface as-is instead of being mistaken for "endpoint does not exist".
    try:
        pipeline_endpoint = PipelineEndpoint.get(
            workspace=aml_workspace, name=env.pipeline_endpoint_name)
    except ErrorResponseException:
        # Lookup failed: create the endpoint with this pipeline as its
        # first version.
        PipelineEndpoint.publish(
            workspace=aml_workspace,
            name=env.pipeline_endpoint_name,
            pipeline=published_pipeline,
            description=env.pipeline_endpoint_name)
    else:
        pipeline_endpoint.add_default(published_pipeline)
def main():
    """Publish the train -> evaluate -> register pipeline.

    Settings are read from environment variables after ``load_dotenv()``.
    The three steps share one ``PipelineData`` ("jsonconfigs"): the train
    step produces it, the evaluate and register steps consume it. The REST
    endpoint of the published pipeline is written to
    ``ml_service/pipelines/train_pipeline.json``.
    """
    load_dotenv()
    read_env = os.environ.get

    # Workspace / service principal settings.
    ws_name = read_env("AML_WORKSPACE_NAME")
    rg_name = read_env("RESOURCE_GROUP")
    sub_id = read_env("SUBSCRIPTION_ID")
    tenant = read_env("TENANT_ID")
    sp_app_id = read_env("SP_APP_ID")
    sp_app_secret = read_env("SP_APP_SECRET")
    # Script locations.
    source_dir = read_env("SOURCES_DIR_TRAIN")
    train_script = read_env("TRAIN_SCRIPT_PATH")
    evaluate_script = read_env("EVALUATE_SCRIPT_PATH")
    register_script = read_env("REGISTER_SCRIPT_PATH")
    # Compute and model settings.
    cpu_sku = read_env("AML_COMPUTE_CLUSTER_CPU_SKU")
    cpu_cluster_name = read_env("AML_COMPUTE_CLUSTER_NAME")
    default_model_name = read_env("MODEL_NAME")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(
        ws_name, rg_name, sub_id, tenant, sp_app_id, sp_app_secret)
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute_cpu = get_compute(aml_workspace, cpu_cluster_name, cpu_sku)
    if aml_compute_cpu is not None:
        print(aml_compute_cpu)

    # Docker-enabled run configuration shared by all three steps.
    dependencies = CondaDependencies.create(
        conda_packages=[
            'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras',
        ],
        pip_packages=[
            'azure', 'azureml-core', 'azure-storage', 'azure-storage-blob',
        ],
    )
    run_config = RunConfiguration(conda_dependencies=dependencies)
    run_config.environment.docker.enabled = True

    model_name_param = PipelineParameter(
        name="model_name", default_value=default_model_name)
    blob_store = Datastore(aml_workspace, "workspaceblobstore")
    jsonconfigs = PipelineData("jsonconfigs", datastore=blob_store)
    # Hour-resolution timestamp, computed once when the pipeline is built.
    config_suffix = datetime.datetime.now().strftime("%Y%m%d%H")

    # Train: produces the shared jsonconfigs output.
    train_step = PythonScriptStep(
        name="Train Model",
        source_directory=source_dir,
        script_name=train_script,
        compute_target=aml_compute_cpu,
        runconfig=run_config,
        arguments=[
            "--config_suffix", config_suffix,
            "--json_config", jsonconfigs,
            "--model_name", model_name_param,
        ],
        outputs=[jsonconfigs],
        allow_reuse=False,
    )
    print("Step Train created")

    # Evaluate: consumes jsonconfigs.
    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        source_directory=source_dir,
        script_name=evaluate_script,
        compute_target=aml_compute_cpu,
        runconfig=run_config,
        arguments=[
            "--config_suffix", config_suffix,
            "--json_config", jsonconfigs,
        ],
        inputs=[jsonconfigs],
        allow_reuse=False,
    )
    print("Step Evaluate created")

    # Register: consumes jsonconfigs.
    register_model_step = PythonScriptStep(
        name="Register New Trained Model",
        source_directory=source_dir,
        script_name=register_script,
        compute_target=aml_compute_cpu,
        runconfig=run_config,
        arguments=[
            "--config_suffix", config_suffix,
            "--json_config", jsonconfigs,
            "--model_name", model_name_param,
        ],
        inputs=[jsonconfigs],
        allow_reuse=False,
    )
    print("Step register model created")

    # Chain the steps; only the last one is handed to Pipeline, the earlier
    # ones are linked to it through the run_after() dependencies.
    evaluate_step.run_after(train_step)
    register_model_step.run_after(evaluate_step)

    train_pipeline = Pipeline(
        workspace=aml_workspace, steps=[register_model_step])
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name="training-pipeline",
        description="Model training/retraining pipeline")

    # Persist the REST endpoint of the published pipeline.
    endpoint_info = {"rest_endpoint": published_pipeline.endpoint}
    with open("ml_service/pipelines/train_pipeline.json", "w") as outfile:
        json.dump(endpoint_info, outfile)
# Python packages installed (via pip) into the run environment of the steps
# below; scikit-learn is pinned to 0.20.3.
run_amlcompute.environment.python.conda_dependencies = CondaDependencies.create(
    pip_packages=[
        'numpy', 'pandas', 'scikit-learn==0.20.3', 'sklearn_pandas',
        'azureml-sdk'
    ])

# Folder containing the step scripts, and the workspace's default datastore
# used to hold the intermediate pipeline data.
scripts_folder = 'scripts'
def_blob_store = ws.get_default_datastore()

# NOTE(review): train_output is created here but is not passed to trainStep
# below (no outputs=/arguments entry) -- confirm it is used further down.
train_output = PipelineData('train_output', datastore=def_blob_store)
print("train_output PipelineData object created")

# Training step: runs scripts/train.py with the model name taken from the
# parsed command-line arguments; previous results are never reused.
trainStep = PythonScriptStep(name="train",
                             script_name="train.py",
                             arguments=["--model_name", args.model_name],
                             compute_target=aml_compute,
                             runconfig=run_amlcompute,
                             source_directory=scripts_folder,
                             allow_reuse=False)
print("trainStep created")

# Output of the evaluation step defined next.
evaluate_output = PipelineData('evaluate_output', datastore=def_blob_store)

evaluateStep = PythonScriptStep(name="evaluate",
                                script_name="evaluate.py",
                                arguments=[
                                    "--model_name", args.model_name,
                                    "--metric_threshold",
                                    float(args.metric_threshold),
                                    "--image_name", args.image_name,
                                    "--output", evaluate_output
Exemplo n.º 8
0
datastore = Datastore.get(ws, "xray_datastore")

# Intermediate pipeline data, both stored on the x-ray datastore.
PreProcessingData = PipelineData("PreProcessingData", datastore=datastore)
ModelData = PipelineData("ModelData", datastore=datastore)

# ---------------------------------------------------------------------------
# Preprocessing step
# ---------------------------------------------------------------------------
# The image dataset is mounted at a fixed path; the remaining datasets are
# passed as plain named inputs.
preprocessing_inputs = [
    xrayimage_dataset.as_named_input('xrayimage_dataset').as_mount(
        '/temp/xray_images'),
]
preprocessing_inputs += [
    dataset.as_named_input(input_name)
    for dataset, input_name in (
        (traindata_dataset, 'traindata_dataset'),
        (validdata_dataset, 'validdata_dataset'),
        (testdata_dataset, 'testdata_dataset'),
        (traintarget_dataset, 'traintarget_dataset'),
        (validtarget_dataset, 'validtarget_dataset'),
        (testtarget_dataset, 'testtarget_dataset'),
    )
]

preprocessing_step = PythonScriptStep(
    name="preprocessing_step",
    source_directory='./scripts/data_preprocess',
    script_name="estimator_data_preprocessing.py",
    compute_target=GPU_compute_target,
    runconfig=run_config_user_managed,
    inputs=preprocessing_inputs,
    outputs=[PreProcessingData],
    arguments=['--PreProcessingData', PreProcessingData],
    allow_reuse=True,
)

print("preprocessing_step")

#######################################################################################################

est = TensorFlow(source_directory='./scripts/train',
Exemplo n.º 9
0
env.register(workspace=ws)
print("Registered environment component-condition")

# Run configuration: Docker enabled, conda dependencies taken from `cd`.
run_config = RunConfiguration()
run_environment = run_config.environment
run_environment.docker.enabled = True
run_environment.python.conda_dependencies = cd

# Pipeline definition
# The training data is referenced from the "trainingdata" datastore and
# downloaded to ./data on the compute target.
training_datastore = Datastore.get(ws, "trainingdata")
inputdata = DataReference(
    datastore=training_datastore,
    data_reference_name="data",
)
downloaded_data = inputdata.as_download(path_on_compute="./data")

train_model = PythonScriptStep(
    name="fit-nlp-model",
    script_name="./train.py",
    inputs=[downloaded_data],
    compute_target=compute_target,
    runconfig=run_config,
)

pipeline = Pipeline(
    workspace=ws,
    steps=[train_model],
    description="Builds Keras model for detecting component defects",
)

# When executed as a script, submit the pipeline and block until it finishes,
# streaming the run output.
if __name__ == "__main__":
    experiment = Experiment(ws, "fit-component-defects-model")
    pipeline_run = experiment.submit(pipeline)
    pipeline_run.wait_for_completion(show_output=True)
Exemplo n.º 10
0
def evaluate_step(datastore, test_dir, model_dir, compute_target):
    '''
    Build the pipeline step that runs ``evaluate.py``.

    The step receives the test data and model directories as inputs and
    writes its results to a new ``eval_dir`` pipeline output. The column
    names, CUDA flag, seed, beam size and maximum decoding time step are
    exposed as pipeline parameters with default values.

    :param datastore: The datastore that will hold the ``eval_dir`` output
    :type datastore: Datastore
    :param test_dir: The reference to the directory containing the test data
    :type test_dir: DataReference
    :param model_dir: The reference to the directory containing the NMT model
    :type model_dir: DataReference
    :param compute_target: The compute target to run the step on
    :type compute_target: ComputeTarget

    :return: The evaluate step, step outputs dictionary (keys: eval_dir)
    :rtype: PythonScriptStep, dict
    '''

    # Docker-based run configuration on the default GPU image; the Python
    # dependencies are installed from the conda package list below.
    eval_run_config = RunConfiguration()
    environment = eval_run_config.environment
    environment.docker.enabled = True
    environment.docker.base_image = DEFAULT_GPU_IMAGE
    environment.python.user_managed_dependencies = False
    environment.python.conda_dependencies = CondaDependencies.create(
        conda_packages=['pytorch', 'tqdm', 'nltk'])

    # Pipeline parameters of the evaluation script, keyed by parameter name.
    # The insertion order defines the order of the script arguments.
    parameter_defaults = {
        'input_col': 'Title',
        'output_col': 'Abstract',
        'cuda': True,
        'seed': 0,
        'beam_size': 5,
        'max_decoding_time_step': 70,
    }
    parameters = {
        param_name: PipelineParameter(name=param_name, default_value=default)
        for param_name, default in parameter_defaults.items()
    }

    # Directory output of the step, mounted on the compute target.
    eval_dir = PipelineData(
        name='eval_dir',
        pipeline_output_name='eval_dir',
        datastore=datastore,
        output_mode='mount',
        is_directory=True)

    # Command line: input directories, then one flag per pipeline
    # parameter, then the output directory.
    script_arguments = ['--test_dir', test_dir, '--model_dir', model_dir]
    for param_name, parameter in parameters.items():
        script_arguments += ['--' + param_name, parameter]
    script_arguments += ['--eval_dir', eval_dir]

    # The step's source directory is the folder that contains this file.
    this_directory = os.path.dirname(os.path.abspath(__file__))
    step = PythonScriptStep(
        name="Evaluate",
        script_name='evaluate.py',
        source_directory=this_directory,
        arguments=script_arguments,
        inputs=[test_dir, model_dir],
        outputs=[eval_dir],
        compute_target=compute_target,
        runconfig=eval_run_config,
        allow_reuse=True
    )

    return step, {'eval_dir': eval_dir}
Exemplo n.º 11
0
# Review and run the cell below to construct the PipelineData objects and the PythonScriptStep pipeline step:
# 
# *Open preprocess.py in the local machine and examine the arguments, inputs, and outputs for the script. Note that there is an argument called process_mode to distinguish between processing training data vs test data. Reviewing the Python script file will give you a good sense of why the script argument names used below are important.*

# In[ ]:


# Intermediate output that will hold the processed training data, stored on
# def_blob_store.
processed_train_data = PipelineData('processed_train_data', datastore=def_blob_store)
print("PipelineData object created")

# Step that runs preprocess.py in 'train' mode: it reads raw_train_data and
# writes to processed_train_data; both are passed to the script as
# --input / --output arguments.
processTrainDataStep = PythonScriptStep(
    name="process_train_data",
    script_name="preprocess.py", 
    arguments=["--process_mode", 'train',
               "--input", raw_train_data,
               "--output", processed_train_data],
    inputs=[raw_train_data],
    outputs=[processed_train_data],
    compute_target=aml_compute,
    runconfig=run_amlcompute,
    source_directory=project_folder
)
# NOTE(review): the message says "preprocessStep" although the variable is
# named processTrainDataStep.
print("preprocessStep created")


# ### Create the Train Pipeline Step

# The train pipeline step takes the *processed_train_data* created in the above step as input and generates another PipelineData object to save the *trained_model* as its output. This is an example of how machine learning pipelines can have many steps and these steps could use or reuse datasources and intermediate data.
# 
# *Open train.py in the local machine and examine the arguments, inputs, and outputs for the script.*

### Challenge Task
Exemplo n.º 12
0
ws = get_workspace(config)

# The "compute" section of the config supplies the keyword arguments.
compute_settings = config["compute"]
compute_target = get_or_create_compute(ws, **compute_settings)

# ---------------------------------------------------------------------------
# Define and set up pipeline
# ---------------------------------------------------------------------------

# Single pipeline parameter, passed to the script as its only argument.
pipeline_param = PipelineParameter(name="my_arg", default_value="default")

# Run configuration whose conda dependencies come from environment.yml.
step_dependencies = CondaDependencies(
    conda_dependencies_file_path="environment.yml")
step_run_config = RunConfiguration(conda_dependencies=step_dependencies)

my_step = PythonScriptStep(
    name="My Script Step",
    source_directory="src",
    script_name="scriptstep.py",
    arguments=[pipeline_param],
    inputs=[],
    outputs=[],
    compute_target=compute_target,
    runconfig=step_run_config,
    allow_reuse=True,
)

published = publish_pipeline(ws, [my_step], "blabla")
pipeline_id, pipeline_endpoint = published

###
# Trigger pipeline via REST API
###

# To trigger the pipeline, a service principal is required: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication

token = requests.post(
def main():
    """Publish and submit the train -> evaluate pipeline.

    Settings are read from environment variables after ``load_dotenv()``.
    The function makes sure the blob datastore and the file dataset exist,
    builds a training step followed by an evaluation step, publishes the
    pipeline as ``TRAINING_PIPELINE_NAME`` with version ``BUILD_BUILDID``
    and submits it to the experiment ``EXPERIMENT_NAME``.

    The SAS token used to register the datastore can be overridden with the
    optional ``DATASTORE_SAS_TOKEN`` environment variable; otherwise the
    value embedded below is used.
    """
    load_dotenv()
    workspace_name = os.environ.get("WS_NAME")
    resource_group = os.environ.get("RG_NAME")
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")
    experiment_name = os.environ.get("EXPERIMENT_NAME")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(
        workspace_name,
        resource_group,
        subscription_id,
        tenant_id,
        app_id,
        app_secret)

    print('Now accessing:')
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(
        aml_workspace,
        compute_name,
        vm_size)
    if aml_compute is not None:
        print(aml_compute)

    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=['numpy', 'pandas',
                        'scikit-learn', 'keras'],
        pip_packages=['azureml-core==1.25.0',
                      'azureml-defaults==1.25.0',
                      'azureml-telemetry==1.25.0',
                      'azureml-train-restclients-hyperdrive==1.25.0',
                      'azureml-train-core==1.25.0',
                      'azureml-dataprep',
                      'tensorflow-gpu==2.0.0',
                      'transformers==2.0.0',
                      'absl-py',
                      'h5py<3.0.0'])
    )
    # run_config.environment.docker.enabled = True

    datastore_name = 'mtcseattle'
    container_name = 'azure-service-classifier'
    account_name = 'mtcseattle'
    # NOTE(review): a credential is committed here, and its "se=" (expiry)
    # field is 2022-05-27. Prefer supplying a current token through the
    # DATASTORE_SAS_TOKEN environment variable; the literal only remains as
    # the backward-compatible default.
    sas_token = os.environ.get(
        "DATASTORE_SAS_TOKEN",
        '?sv=2020-04-08&st=2021-05-26T04%3A39%3A46Z&se=2022-05-27T04%3A39%3A00Z&sr=c&sp=rl&sig=CTFMEu24bo2X06G%2B%2F2aKiiPZBzvlWHELe15rNFqULUk%3D')  # noqa: E501

    # Reuse the datastore when it is already registered; otherwise register
    # it. ``Exception`` rather than a bare ``except`` so that
    # KeyboardInterrupt / SystemExit are not swallowed.
    try:
        existing_datastore = Datastore.get(aml_workspace, datastore_name)
    except Exception:
        existing_datastore = Datastore \
            .register_azure_blob_container(workspace=aml_workspace,
                                           datastore_name=datastore_name,
                                           container_name=container_name,
                                           account_name=account_name,
                                           sas_token=sas_token,
                                           overwrite=True)

    azure_dataset = Dataset.File.from_files(
        path=(existing_datastore, 'data'))

    azure_dataset = azure_dataset.register(
        workspace=aml_workspace,
        name='Azure Services Dataset',
        description='Dataset containing azure related posts on Stackoverflow',
        create_new_version=True)

    # NOTE(review): the result of to_path() is discarded; the call is kept
    # in case it is relied on as an early check that the files can be
    # listed -- confirm whether it can be removed.
    azure_dataset.to_path()
    input_data = azure_dataset.as_named_input('azureservicedata').as_mount(
        '/tmp/data')

    # Parameters that can be overridden when the published pipeline is run.
    model_name_param = PipelineParameter(
        name="model_name", default_value=model_name)
    max_seq_length = PipelineParameter(
        name="max_seq_length", default_value=128)
    learning_rate = PipelineParameter(
        name="learning_rate", default_value=3e-5)
    num_epochs = PipelineParameter(
        name="num_epochs", default_value=1)
    export_dir = PipelineParameter(
        name="export_dir", default_value="./outputs/model")
    batch_size = PipelineParameter(
        name="batch_size", default_value=32)
    steps_per_epoch = PipelineParameter(
        name="steps_per_epoch", default_value=1)

    # initialize the PythonScriptStep
    train_step = PythonScriptStep(
        name='Train Model',
        script_name=train_script_path,
        arguments=['--data_dir', input_data,
                   '--max_seq_length', max_seq_length,
                   '--batch_size', batch_size,
                   '--learning_rate', learning_rate,
                   '--steps_per_epoch', steps_per_epoch,
                   '--num_epochs', num_epochs,
                   '--export_dir', export_dir],
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        runconfig=run_config,
        allow_reuse=True)
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=evaluate_script_path,
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        arguments=[
            "--model_name", model_name_param,
            "--build_id", build_id,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    # Currently, the Evaluate step will automatically register
    # the model if it performs better. This step is based on a
    # previous version of the repo which utilized JSON files to
    # track evaluation results.

    # Only the last step is handed to Pipeline; the train step is linked to
    # it through the run_after() dependency.
    evaluate_step.run_after(train_step)
    steps = [evaluate_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline.",
        version=build_id
    )
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')

    # Kick off a run of the freshly published pipeline.
    published_pipeline.submit(
        workspace=aml_workspace,
        experiment_name=experiment_name)
Exemplo n.º 14
0
def main():
    """Build, validate and publish the train -> evaluate -> register pipeline.

    All settings are read from ``Env``. The function looks up the workspace
    and the compute cluster, creates three ``PythonScriptStep`` objects that
    share one run configuration, chains them with ``run_after`` and publishes
    the pipeline under ``e.pipeline_name`` with version ``e.build_id``.
    The published pipeline is not submitted here.
    """
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Run configuration shared by all three steps.
    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=[
            'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras'
        ],
        pip_packages=[
            'azure', 'azureml-core', 'azure-storage', 'azure-storage-blob'
        ]))
    run_config.environment.docker.enabled = True

    # BUILDURI_BASE is only set when both URL parts are configured;
    # otherwise the steps run with an empty environment-variable mapping.
    config_envvar = {}
    if e.collection_uri is not None and e.teamproject_name is not None:
        builduri_base = e.collection_uri + e.teamproject_name
        builduri_base = builduri_base + "/_build/results?buildId="
        config_envvar["BUILDURI_BASE"] = builduri_base
    run_config.environment.environment_variables = config_envvar

    # Pipeline parameters; their defaults come from the environment.
    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)
    build_id_param = PipelineParameter(name="build_id",
                                       default_value=e.build_id)
    hyperparameter_alpha_param = PipelineParameter(name="hyperparameter_alpha",
                                                   default_value=0.5)

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id",
            build_id_param,
            "--model_name",
            model_name_param,
            "--alpha",
            hyperparameter_alpha_param,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id",
            build_id_param,
            "--model_name",
            model_name_param,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id",
            build_id_param,
            "--model_name",
            model_name_param,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")

    # Enforce sequential execution: train, then evaluate, then register.
    evaluate_step.run_after(train_step)
    register_step.run_after(evaluate_step)
    steps = [train_step, evaluate_step, register_step]

    # The original code referenced `train_pipeline._set_experiment_name`
    # here without calling it; that statement had no effect and was dropped.
    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Exemplo n.º 15
0
def main():
    """Build, validate and publish the train -> evaluate pipeline.

    Configuration is read from environment variables (after ``load_dotenv``).
    The workspace is obtained with service-principal credentials, a shared
    run configuration is created, and a train step followed by an evaluate
    step is published under ``TRAINING_PIPELINE_NAME`` with version
    ``BUILD_BUILDID``.

    Raises:
        TypeError: if ``BASE_NAME`` is not set, because ``None`` is
            concatenated with a string.
    """
    load_dotenv()
    workspace_name = os.environ.get("BASE_NAME") + "-AML-WS"
    resource_group = os.environ.get("BASE_NAME") + "-AML-RG"
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
    # register_script_path = os.environ.get("REGISTER_SCRIPT_PATH")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")

    # The service-principal secret is deliberately NOT printed: anything
    # written to stdout ends up in build logs.

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(workspace_name, resource_group,
                                  subscription_id, tenant_id, app_id,
                                  app_secret)
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, compute_name, vm_size)
    if aml_compute is not None:
        print(aml_compute)

    # Run configuration shared by both steps.
    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=[
            'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras'
        ],
        pip_packages=[
            'azure', 'azureml-core', 'azure-storage', 'azure-storage-blob'
        ]))
    run_config.environment.docker.enabled = True

    # From here on `model_name` is the pipeline parameter, not the raw string.
    model_name = PipelineParameter(name="model_name", default_value=model_name)
    release_id = PipelineParameter(name="release_id", default_value="0")

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=train_script_path,
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        arguments=[
            "--release_id",
            release_id,
            "--model_name",
            model_name,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=evaluate_script_path,
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        arguments=[
            "--release_id",
            release_id,
            "--model_name",
            model_name,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    # Currently, the Evaluate step will automatically register
    # the model if it performs better. This step is based on a
    # previous version of the repo which utilized JSON files to
    # track evaluation results.

    # register_model_step = PythonScriptStep(
    #     name="Register New Trained Model",
    #     script_name=register_script_path,
    #     compute_target=aml_compute,
    #     source_directory=sources_directory_train,
    #     arguments=[
    #         "--release_id", release_id,
    #         "--model_name", model_name,
    #     ],
    #     runconfig=run_config,
    #     allow_reuse=False,
    # )
    # print("Step register model created")

    # Only the evaluate step is listed; the train step is attached to it
    # through run_after.
    evaluate_step.run_after(train_step)
    # register_model_step.run_after(evaluate_step)
    steps = [evaluate_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline",
        version=build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Exemplo n.º 16
0
                             path_on_datastore=path_on_datastore)
print("DataReference object created")

# Intermediate pipeline data objects, each bound to def_blob_store.
# raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
logits_data = PipelineData(
    "logits_from_xception",
    datastore=def_blob_store,
)
data_metrics = PipelineData(
    "data_metrics",
    datastore=def_blob_store,
)
data_output = PipelineData(
    "output_data",
    datastore=def_blob_store,
)

# Step running get_logits_from_xception.py: reads labeled_data and writes
# its result to logits_data.
get_logits_from_xception = PythonScriptStep(
    name='get_logits_from_xception',
    source_directory=script_folder,
    script_name="get_logits_from_xception.py",
    compute_target=gpu_compute_target,
    runconfig=gpu_compute_run_config,
    inputs=[labeled_data],
    outputs=[logits_data],
    arguments=[
        "--data-folder", labeled_data,
        "--output_data", logits_data,
    ],
    allow_reuse=True,
    hash_paths=['.'],
)
print("logit step created")

# Rebind def_blob_store to the workspace's default datastore.
def_blob_store = ws.get_default_datastore()

# script_params = {
#     '--data-folder': def_blob_store.path('256_ObjectCategories_preproc').as_mount(),
#     '--remote_execution': ""
#         estimator_entry_script_arguments=[
#             '--data-folder', preprocessed_data,
Exemplo n.º 17
0
print("Azure Machine Learning Compute attached")

# Look up the datastore registered as "workspaceblobstore".
def_blob_store = Datastore(ws, "workspaceblobstore")
print("Blobstore's name: {}".format(def_blob_store.name))

# Intermediate data produced by the anomaly-detection step.
anomaly_data = PipelineData(
    "anomaly_data",
    datastore=def_blob_store,
)
print("Anomaly data object created")

# Step running code/anom_detect.py; its output directory is anomaly_data.
anom_detect = PythonScriptStep(
    name="anomaly_detection",
    source_directory=project_folder,
    # script_name="anom_detect.py",
    script_name="code/anom_detect.py",
    compute_target=aml_compute,
    runconfig=amlcompute_run_config,
    outputs=[anomaly_data],
    arguments=[
        "--output_directory",
        anomaly_data,
    ],
    allow_reuse=True,
)
print("Anomaly Detection Step created.")

automl_train = PythonScriptStep(
    name="automl_train",
    # script_name="automl_train.py",
    script_name="code/automl_train.py",
    arguments=["--input_directory", anomaly_data],
    inputs=[anomaly_data],
    compute_target=aml_compute,
    source_directory=project_folder,
    allow_reuse=True,
def main():
    """Publish and submit the TensorFlow train -> evaluate pipeline.

    Configuration is read from environment variables (after ``load_dotenv``).
    The function obtains the workspace and compute cluster, gets or registers
    the 'tfworld' blob datastore, registers a file dataset from it, builds an
    ``EstimatorStep`` for training followed by a ``PythonScriptStep`` for
    evaluation, publishes the pipeline, submits it to ``EXPERIMENT_NAME`` and
    finally looks up the AKS cluster.

    Raises:
        TypeError: if ``BASE_NAME`` is not set, because ``None`` is
            concatenated with a string.
    """
    load_dotenv()
    workspace_name = os.environ.get("BASE_NAME") + "-AML-WS"
    resource_group = "AML-RG-" + os.environ.get("BASE_NAME")
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    aks_name = os.environ.get("AKS_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")
    experiment_name = os.environ.get("EXPERIMENT_NAME")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(workspace_name, resource_group,
                                  subscription_id, tenant_id, app_id,
                                  app_secret)
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, compute_name, vm_size)
    if aml_compute is not None:
        print(aml_compute)

    # Run configuration used by the evaluate step (the train step uses the
    # TensorFlow estimator defined further down).
    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=['numpy', 'pandas', 'scikit-learn', 'keras'],
        pip_packages=[
            'azure', 'azureml-sdk', 'azure-storage', 'azure-storage-blob',
            'transformers>=2.1.1', 'tensorflow>=2.0.0', 'tensorflow-gpu>=2.0.0'
        ]))
    run_config.environment.docker.enabled = True

    datastore_name = 'tfworld'
    container_name = 'azure-service-classifier'
    account_name = 'johndatasets'
    # NOTE(review): a SAS token is hard-coded in source. Its `se=` field
    # indicates it expires 2021-06-02; it should be supplied through
    # configuration instead of being committed.
    sas_token = '?sv=2019-02-02&ss=bfqt&srt=sco&sp=rl&se=2021-06-02T03:40:25Z&st=2020-03-09T19:40:25Z&spr=https&sig=bUwK7AJUj2c%2Fr90Qf8O1sojF0w6wRFgL2c9zMVCWNPA%3D'

    # Reuse the datastore when it is already registered; register it
    # otherwise. `except Exception` (rather than a bare except) lets
    # KeyboardInterrupt / SystemExit propagate instead of triggering a
    # registration attempt.
    try:
        existing_datastore = Datastore.get(aml_workspace, datastore_name)
    except Exception:
        existing_datastore = Datastore \
            .register_azure_blob_container(workspace=aml_workspace,
                                           datastore_name=datastore_name,
                                           container_name=container_name,
                                           account_name=account_name,
                                           sas_token=sas_token
                                           )

    azure_dataset = Dataset.File.from_files(path=(existing_datastore, 'data'))
    azure_dataset = azure_dataset.register(
        workspace=aml_workspace,
        name='Azure Services Dataset',
        description='Dataset containing azure related posts on Stackoverflow',
        create_new_version=True)

    # The return value is discarded.
    # NOTE(review): presumably kept as an access check - confirm.
    azure_dataset.to_path()
    input_data = azure_dataset.as_named_input('input_data1').as_mount(
        '/tmp/data')

    # From here on `model_name` is the pipeline parameter, not the raw string.
    model_name = PipelineParameter(name="model_name", default_value=model_name)
    max_seq_length = PipelineParameter(name="max_seq_length",
                                       default_value=128)
    learning_rate = PipelineParameter(name="learning_rate", default_value=3e-5)
    num_epochs = PipelineParameter(name="num_epochs", default_value=3)
    export_dir = PipelineParameter(name="export_dir",
                                   default_value="./outputs/exports")
    batch_size = PipelineParameter(name="batch_size", default_value=32)
    steps_per_epoch = PipelineParameter(name="steps_per_epoch",
                                        default_value=100)

    # initialize the TensorFlow estimator
    estimator = TensorFlow(source_directory=sources_directory_train,
                           entry_script=train_script_path,
                           compute_target=aml_compute,
                           framework_version='2.0',
                           use_gpu=True,
                           pip_packages=[
                               'transformers==2.0.0',
                               'azureml-dataprep[fuse,pandas]==1.3.0'
                           ])

    train_step = EstimatorStep(
        name="Train Model",
        estimator=estimator,
        estimator_entry_script_arguments=[
            "--data_dir", input_data, "--max_seq_length", max_seq_length,
            "--learning_rate", learning_rate, "--num_epochs", num_epochs,
            "--export_dir", export_dir, "--batch_size", batch_size,
            "--steps_per_epoch", steps_per_epoch
        ],
        compute_target=aml_compute,
        inputs=[input_data],
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=evaluate_script_path,
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        arguments=[
            "--model_name",
            model_name,
            "--build_id",
            build_id,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    # Currently, the Evaluate step will automatically register
    # the model if it performs better. This step is based on a
    # previous version of the repo which utilized JSON files to
    # track evaluation results.

    # Only the evaluate step is listed; the train step is attached to it
    # through run_after.
    evaluate_step.run_after(train_step)
    steps = [evaluate_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline",
        version=build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')

    response = published_pipeline.submit(  # noqa: F841
        workspace=aml_workspace,
        experiment_name=experiment_name)

    # Get AKS cluster for deployment
    aks_compute = get_aks(aml_workspace, aks_name)
    if aks_compute is not None:
        print(aks_compute)
    'onnxruntime==1.3.0', 
    'tf2onnx==1.6.3'
])

# Folder containing the step scripts, and the workspace default datastore.
scripts_folder = 'scripts'
def_blob_store = ws.get_default_datastore()

# Output of the training step.
train_output = PipelineData(
    'train_output',
    datastore=def_blob_store,
)
print("train_output PipelineData object created")

# Training step: runs train.py and writes to train_output.
trainStep = PythonScriptStep(
    name="train",
    source_directory=scripts_folder,
    script_name="train.py",
    compute_target=aml_compute,
    runconfig=run_amlcompute,
    outputs=[train_output],
    arguments=[
        "--model_name", args.model_name,
        "--build_number", args.build_number,
        "--output", train_output,
    ],
    allow_reuse=False,
)
print("trainStep created")

# Output of the evaluation step.
evaluate_output = PipelineData(
    'evaluate_output',
    datastore=def_blob_store,
)

evaluateStep = PythonScriptStep(
    name="evaluate",
    script_name="evaluate.py", 
    arguments=["--model_name", args.model_name, 
               "--build_number", args.build_number, 
               "--input", train_output, 
Exemplo n.º 20
0
# Attach the prepared environment to the shared run configuration.
run_config.environment = env

# Step 1: Train Model
# Directory output of the training step, mounted on the compute target.
train_output_dir = PipelineData(
    name='train_output',
    datastore=datastore,
    output_mode='mount',
    is_directory=True,
    pipeline_output_name='train_output',
)

train_step = PythonScriptStep(
    name='Train Model',
    script_name='train.py',
    source_directory='./src',
    compute_target=compute_target,
    runconfig=run_config,
    inputs=[dataset, checkpoint, tensorflow_models],
    outputs=[train_output_dir],
    arguments=[
        '--data_dir', dataset,
        '--checkpoint_dir', checkpoint,
        '--tensorflow_models_dir', tensorflow_models,
        '--output_dir', train_output_dir,
    ],
)

# Step 2: Export Model
# Directory output of the export step, mounted on the compute target.
export_output_dir = PipelineData(
    name='export_output',
    datastore=datastore,
    output_mode='mount',
    is_directory=True,
    pipeline_output_name='export_output',
)

export_step = PythonScriptStep(
processed_mnist_data

# Docker-based run configuration on the default CPU image; the Python
# dependencies are not user-managed and are declared below.
run_config = RunConfiguration()
run_config.environment.docker.enabled = True
run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
run_config.environment.python.user_managed_dependencies = False
run_config.environment.python.conda_dependencies = CondaDependencies.create(
    pip_packages=[
        'azureml-sdk',
        'numpy',
    ],
)

# Directory holding the extraction script.
source_directory = 'DataExtraction'

# Extraction step: runs extract.py and writes to processed_mnist_data.
extractDataStep = PythonScriptStep(
    source_directory=source_directory,
    script_name="extract.py",
    compute_target=compute_target_cpu,
    runconfig=run_config,
    outputs=[processed_mnist_data],
    arguments=[
        "--output_extract",
        processed_mnist_data,
    ],
)

print("Data Extraction Step created")

from azureml.train.dnn import TensorFlow

# Directory holding the training script; rebinds source_directory.
source_directory = 'Training'

# CPU-only TensorFlow 1.13 estimator that runs train.py.
est = TensorFlow(
    source_directory=source_directory,
    entry_script='train.py',
    compute_target=compute_target_cpu,
    framework_version='1.13',
    use_gpu=False,
)
Exemplo n.º 22
0
def build_pipeline(dataset, ws, config):
    """Build, validate, publish and schedule the prednet pipeline.

    The pipeline consists of four steps executed in sequence: video decoding,
    data preparation, HyperDrive training and model registration. After
    publishing, a datastore-triggered schedule is created for it.

    Args:
        dataset: Name of the video dataset. It is used in datastore paths,
            passed to the training script and appended to the pipeline name.
        ws: Workspace in which compute targets, the pipeline and the
            schedule are created.
        config: Mapping providing the keys 'cpu_compute', 'gpu_compute',
            'acr_address', 'acr_username' and 'acr_password'.

    Returns:
        The name of the published pipeline ('prednet_' + dataset).
    """
    print("building pipeline for dataset %s in workspace %s" % (dataset, ws.name))

    base_dir = '.'

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)

    # Copy every file the remote steps need into the upload folder.
    required_files = (
        'video_decoding.py',
        'pipelines_submit.py',
        'pipelines_create.py',
        'train.py',
        'data_utils.py',
        'prednet.py',
        'keras_utils.py',
        'data_preparation.py',
        'model_registration.py',
        'config.json',
    )
    for file_name in required_files:
        shutil.copy(os.path.join(base_dir, file_name), script_folder)

    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except Exception:
        # NOTE(review): the original comment hints at ComputeTargetException;
        # narrow this once that name is confirmed to be imported.
        print("creating new compute target")

        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_D2_V2',
            max_nodes=4,
            idle_seconds_before_scaledown=1800)
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name, provisioning_config)
        cpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = config['gpu_compute']

    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except Exception:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6',
            max_nodes=10,
            idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name, provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    # A failure here is only reported; `except Exception` keeps
    # KeyboardInterrupt / SystemExit propagating.
    try:
        print(gpu_compute_target.get_status().serialize())
    except Exception as e:
        print("Could not get status of compute target.")
        print(e)

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(
        conda_packages=["py-opencv=3.4.2"],
        pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D',
        pip_packages=[
            "azure-storage-blob==1.5.0",
            "hickle==3.4.3",
            "requests==2.21.0",
            "sklearn",
            "pandas==0.24.2",
            "azureml-sdk",
            "numpy==1.16.2",
            "pillow==6.0.0",
        ])

    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    # DataReference to where video data is stored.
    video_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="video_data",
        path_on_datastore=os.path.join("prednet", "data", "video", dataset))
    print("DataReference object created")

    # Intermediate data passed between the pipeline steps.
    raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames", datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py",
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    print("video_decode step created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name='prepare_data',
        script_name="data_preparation.py",
        arguments=["--input_data", raw_data, "--output_data", preprocessed_data],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    data_prep.run_after(video_decoding)

    print("data_prep step created")

    # configure access to ACR for pulling our custom docker image
    acr = ContainerRegistry()
    acr.address = config['acr_address']
    acr.username = config['acr_username']
    acr.password = config['acr_password']

    est = Estimator(source_directory=script_folder,
                    compute_target=gpu_compute_target,
                    entry_script='train.py',
                    use_gpu=True,
                    node_count=1,
                    custom_docker_image="wopauli_1.8-gpu:1",
                    image_registry_details=acr,
                    user_managed=True
                    )

    # Hyperparameter search space sampled at random by HyperDrive.
    ps = RandomParameterSampling(
        {
            '--batch_size': choice(1, 2, 4, 8),
            '--filter_sizes': choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
            '--stack_sizes': choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),  # , "48, 96"),
            '--learning_rate': loguniform(-6, -1),
            '--lr_decay': loguniform(-9, -1),
            '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
            '--transfer_learning': choice("True", "False")
        }
    )

    policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1, delay_evaluation=10)

    hdc = HyperDriveConfig(estimator=est,
                           hyperparameter_sampling=ps,
                           policy=policy,
                           primary_metric_name='val_loss',
                           primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                           max_total_runs=10,
                           max_concurrent_runs=5,
                           max_duration_minutes=60*6
                           )

    hd_step = HyperDriveStep(
        name="train_w_hyperdrive",
        hyperdrive_run_config=hdc,
        estimator_entry_script_arguments=[
            '--data-folder', preprocessed_data,
            '--remote_execution',
            '--dataset', dataset
            ],
        inputs=[preprocessed_data],
        metrics_output=data_metrics,
        allow_reuse=True
    )
    hd_step.run_after(data_prep)

    # NOTE(review): unlike the other script steps, no runconfig is passed
    # here - confirm this is intended.
    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=cpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.']
    )
    registration_step.run_after(hd_step)

    pipeline = Pipeline(workspace=ws, steps=[video_decoding, data_prep, hd_step, registration_step])
    print("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete")

    pipeline_name = 'prednet_' + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    # Create a schedule that polls the dataset's 'Train' folder on the
    # datastore. The returned Schedule object is not needed afterwards.
    Schedule.create(workspace=ws, name=pipeline_name + "_sch",
                    pipeline_id=published_pipeline.id,
                    experiment_name=pipeline_name,
                    datastore=def_blob_store,
                    wait_for_provisioning=True,
                    description="Datastore scheduler for Pipeline " + pipeline_name,
                    path_on_datastore=os.path.join('prednet/data/video', dataset, 'Train'),
                    polling_interval=1
                    )

    return pipeline_name
Exemplo n.º 23
0
                                      'azureml-sdk', 'tqdm'
                                  ])

# Docker-based GPU run configuration built from the gpu_cd dependencies.
gpu_compute_run_config = RunConfiguration(conda_dependencies=gpu_cd)
gpu_compute_run_config.environment.docker.enabled = True
gpu_compute_run_config.environment.docker.gpu_support = True
gpu_compute_run_config.environment.docker.base_image = DEFAULT_GPU_IMAGE
gpu_compute_run_config.environment.spark.precache_packages = False

# Single training step: runs train_xception.py on the GPU target with
# labeled_data as input.
train_xception = PythonScriptStep(
    name='train_Xception',
    source_directory=script_folder,
    script_name="train_xception.py",
    compute_target=gpu_compute_target,
    runconfig=gpu_compute_run_config,
    inputs=[labeled_data],
    arguments=[
        "--data-folder",
        labeled_data,
        "--remote_execution",
    ],
    allow_reuse=True,
    hash_paths=['.'],
)
print("training step created")

# Assemble the one-step pipeline.
pipeline = Pipeline(
    workspace=ws,
    steps=[train_xception],
)
print("Pipeline is built")

# Check the pipeline definition before it is used.
pipeline.validate()
print("Validation complete")

pipeline_name = 'kd_train_the_teacher'
Exemplo n.º 24
0
def main():
    """Build, validate and publish the model training pipeline.

    Settings are read from an ``Env`` instance. The pipeline consists of a
    train step, an optional evaluate step and a register step, all running as
    ``PythonScriptStep`` objects on the compute returned by ``get_compute``
    with a shared run configuration. The evaluate step is included only when
    ``e.run_evaluation`` equals "true" (case-insensitive). The pipeline is
    published under ``e.pipeline_name`` with ``e.build_id`` as its version.
    """
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(aml_workspace,
                                  e.aml_env_name,
                                  create_new=False)

    run_config = RunConfiguration()
    run_config.environment = environment

    # Use the configured datastore if one is set, otherwise the workspace
    # default, and expose its name to the steps via an environment variable.
    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name

    # Parameters that can be overridden each time the pipeline is submitted
    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)
    dataset_version_param = PipelineParameter(name="dataset_version",
                                              default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(name="data_file_path",
                                             default_value="none")
    caller_run_id_param = PipelineParameter(name="caller_run_id",
                                            default_value="none")

    # Get dataset name
    # NOTE(review): the commented-out code that created and registered a
    # sample dataset was removed here; the dataset named by e.dataset_name is
    # presumably registered elsewhere - confirm.
    dataset_name = e.dataset_name

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        'pipeline_data', datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_output",
            pipeline_data,
            "--dataset_version",
            dataset_version_param,
            "--data_file_path",
            data_file_path_param,
            "--caller_run_id",
            caller_run_id_param,
            "--dataset_name",
            dataset_name,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name",
            model_name_param,
            "--allow_run_cancel",
            e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--model_name",
            model_name_param,
            "--step_input",
            pipeline_data,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")

    # Check run_evaluation flag to include or exclude evaluation step.
    if e.run_evaluation.lower() == 'true':
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    # The original had a bare `train_pipeline._set_experiment_name` reference
    # here (never called, so it had no effect); it has been dropped.
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Exemplo n.º 25
0
def main():
    """Publish the model deployment pipeline.

    Reads its settings from environment variables (after ``load_dotenv``),
    resolves the workspace and compute cluster, wraps the deploy script in a
    single ``PythonScriptStep`` and publishes the validated pipeline under
    ``DEPLOY_PIPELINE_NAME`` with ``BUILD_BUILDID`` as its version.
    """
    load_dotenv()
    settings = os.environ

    # Workspace and service-principal settings
    workspace_name = settings.get("BASE_NAME") + "-AML-WS"
    resource_group = settings.get("RESOURCE_GROUP")
    subscription_id = settings.get("SUBSCRIPTION_ID")
    tenant_id = settings.get("TENANT_ID")
    app_id = settings.get("SP_APP_ID")
    app_secret = settings.get("SP_APP_SECRET")

    # Compute, script and naming settings
    deploy_script_path = settings.get("DEPLOY_SCRIPT_PATH")
    vm_size = settings.get("AML_COMPUTE_CLUSTER_CPU_SKU")
    compute_name = settings.get("AML_COMPUTE_CLUSTER_NAME")
    default_model_name = settings.get("MODEL_NAME")
    build_id = settings.get("BUILD_BUILDID")
    pipeline_name = settings.get("DEPLOY_PIPELINE_NAME")
    default_service_name = settings.get("DEPLOY_SERVICE_NAME")
    sources_directory_train = settings.get("SOURCES_DIR_TRAIN")

    # Resolve the Azure ML workspace
    aml_workspace = get_workspace(
        workspace_name,
        resource_group,
        subscription_id,
        tenant_id,
        app_id,
        app_secret,
    )
    print(aml_workspace)

    # Resolve the compute cluster the step will run on
    aml_compute = get_compute(aml_workspace, compute_name, vm_size)
    if aml_compute is not None:
        print(aml_compute)

    # Python environment for the deploy step
    pip_requirements = [
        'azureml-core==1.0.72.*',
        'azureml-sdk==1.0.72.*',
        'azure-storage',
        'azure-storage-blob',
        'azureml-dataprep',
        'azureml-datadrift==1.0.72.*',
    ]
    conda_dependencies = CondaDependencies.create(
        conda_packages=['numpy', 'pandas', 'scikit-learn'],
        pip_packages=pip_requirements,
        pin_sdk_version=False,
    )
    print(conda_dependencies.serialize_to_string())

    run_config = RunConfiguration(
        framework='Python',
        conda_dependencies=conda_dependencies,
    )
    run_config.environment.docker.enabled = True

    # Parameters that can be overridden when the published pipeline is run
    model_name_param = PipelineParameter(
        name="model_name", default_value=default_model_name)
    print(model_name_param)
    release_id_param = PipelineParameter(name="release_id", default_value="0")
    print(release_id_param)
    service_name_param = PipelineParameter(
        name="service_name", default_value=default_service_name)
    print(service_name_param)

    deploy_step = PythonScriptStep(
        name="Deploy Model",
        script_name=deploy_script_path,
        source_directory=sources_directory_train,
        compute_target=aml_compute,
        runconfig=run_config,
        arguments=[
            "--release_id",
            release_id_param,
            "--model_name",
            model_name_param,
            "--service_name",
            service_name_param,
        ],
        allow_reuse=False,
    )
    print("Step Deploy created")

    # Build, validate and publish the one-step pipeline
    deploy_pipeline = Pipeline(workspace=aml_workspace, steps=[deploy_step])
    deploy_pipeline.validate()
    published_pipeline = deploy_pipeline.publish(
        name=pipeline_name,
        description="Model deploy  pipeline",
        version=build_id,
    )
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Exemplo n.º 26
0
def get_pipeline(aml_compute: ComputeTarget, blob_ds: Datastore,
                 batch_env: Environment, tf_env: Environment) -> list:
    """
    Creates pipeline steps: a Python script step, a parallel-run step and an
    MPI step, chained with run_after to execute in that order.

    Also reads ``debug_connection_string_secret_name`` and ``compute_name``,
    which are not parameters and must be defined outside this function.

    Parameters:
        aml_compute (ComputeTarget): a reference to a compute
        blob_ds (DataStore): a reference to a datastore
        batch_env (Environment): a reference to environment object
        tf_env (Environment): a horovod/tf environment
    Returns:
        list: the pipeline steps [single_step, parallelrun_step, mpi_step]
    """

    # We need something to generate data by the way
    pipeline_files = PipelineData("pipeline_files",
                                  datastore=blob_ds).as_dataset()

    # Pipeline parameters to use with every run
    is_debug = PipelineParameter("is_debug", default_value=False)
    relay_connection_name = PipelineParameter("debug_relay_connection_name",
                                              default_value="none")

    single_step_config = RunConfiguration()
    single_step_config.environment = batch_env
    single_step = PythonScriptStep(
        name="single-step",
        script_name="samples/azure_ml_advanced/steps/single_step.py",
        source_directory=".",
        runconfig=single_step_config,
        arguments=[
            "--pipeline-files", pipeline_files, "--is-debug", is_debug,
            "--debug-relay-connection-name", relay_connection_name,
            "--debug-port", 5678, "--debug-relay-connection-string-secret",
            debug_connection_string_secret_name
        ],
        inputs=[],
        outputs=[pipeline_files],
        compute_target=aml_compute,
        allow_reuse=False)

    output_dir = PipelineData("output_dir")

    parallel_run_config = ParallelRunConfig(
        entry_script="samples/azure_ml_advanced/steps/parallel_step.py",
        source_directory=".",
        mini_batch_size="5",
        output_action="summary_only",
        environment=batch_env,
        compute_target=aml_compute,
        error_threshold=10,
        run_invocation_timeout=600,  # very important for debugging
        node_count=2,
        process_count_per_node=1)

    parallelrun_step = ParallelRunStep(
        name="parallel-run-step",
        parallel_run_config=parallel_run_config,
        inputs=[pipeline_files],
        output=output_dir,
        arguments=[
            "--is-debug", is_debug, "--debug-relay-connection-name",
            relay_connection_name, "--debug-port", 5679,
            "--debug-relay-connection-string-secret",
            debug_connection_string_secret_name
        ],
        allow_reuse=False)

    parallelrun_step.run_after(single_step)

    distr_config = MpiConfiguration(process_count_per_node=1, node_count=2)

    # The MPI script and its arguments are shared by the ScriptRunConfig
    # (used only as the source of the step's run configuration) and the step.
    mpi_script = "samples/azure_ml_advanced/steps/mpi/mpi_step_starter.py"
    mpi_arguments = [
        "--input-ds", pipeline_files, "--is-debug", is_debug,
        "--debug-relay-connection-name", relay_connection_name,
        "--debug-port", 5680, "--debug-relay-connection-string-secret",
        debug_connection_string_secret_name
    ]

    # NOTE(review): compute_name is not a parameter of this function -
    # presumably a module-level name; the step below runs on aml_compute.
    # Confirm whether aml_compute was intended here.
    src = ScriptRunConfig(
        source_directory=".",
        script=mpi_script,
        arguments=mpi_arguments,
        compute_target=compute_name,
        environment=tf_env,
        distributed_job_config=distr_config,
    )

    mpi_step = PythonScriptStep(
        name="mpi-step",
        script_name=mpi_script,
        # separate list object so the step and the run config do not alias
        arguments=list(mpi_arguments),
        compute_target=aml_compute,
        inputs=[pipeline_files],
        outputs=[],
        runconfig=src.run_config,
        source_directory=".")

    mpi_step.run_after(parallelrun_step)

    print("Pipeline Steps Created")

    steps = [single_step, parallelrun_step, mpi_step]

    print(f"Returning {len(steps)} steps")
    return steps
Exemplo n.º 27
0
# Conda dependencies for the step: only the AutoML training package via pip
cd = CondaDependencies.create(pip_packages=["azureml-train-automl"])

# Run configuration: Docker and GPU support switched off, default CPU image
# recorded as the base image
amlcompute_run_config = RunConfiguration(
    framework="python",
    conda_dependencies=cd,
)
amlcompute_run_config.environment.docker.enabled = False
amlcompute_run_config.environment.docker.gpu_support = False
amlcompute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
amlcompute_run_config.environment.spark.precache_packages = False

def_data_store = ws.get_default_datastore()

# Step that runs automl_step.py from the current directory
automl_step = PythonScriptStep(
    name="automl_step",
    source_directory=".",  # project_folder
    script_name="automl_step.py",
    runconfig=amlcompute_run_config,
    compute_target=aml_compute_target,
    allow_reuse=True,
)

print("AutoML Training Step created.")

steps = [automl_step]
print("Step lists created")

# Build the pipeline and validate its definition
pipeline = Pipeline(steps=steps, workspace=ws)
print("Pipeline is built")

pipeline.validate()
print("Pipeline validation complete")
Exemplo n.º 28
0
def build_pipeline_steps(automlconfig: AutoMLConfig,
                         data: Dataset,
                         target_column: str,
                         compute_target: ComputeTarget,
                         group_column_names: list,
                         time_column_name: str,
                         deploy: bool,
                         service_name: str = 'grouping-demo') -> StepSequence:
    """Build an AutoML train step and a register step for every group.

    The per-group AutoML configs come from ``_get_configs``. Each group gets
    an ``AutoMLStep`` (metrics and best-model outputs) followed by a
    ``register.py`` step that receives the group name as the model name.

    When ``deploy`` is true, ``deploy/myenv.yml`` is rewritten with the
    azureml-defaults / azureml-train-automl pip packages merged in, and a
    ``deploy.py`` step is added.

    Returns the flat list of train/register steps when ``deploy`` is false,
    otherwise ``StepSequence(steps=[steps, deployment_step])``.
    """
    # One AutoML config per group
    group_configs = _get_configs(automlconfig, data, target_column, compute_target, group_column_names)

    # Run configuration shared by all register steps
    register_config = RunConfiguration()
    register_deps = CondaDependencies()
    register_deps.add_pip_package('azureml-pipeline')
    register_config.environment.python.conda_dependencies = register_deps

    steps = []
    model_names = []

    # Train + register for each group
    for index, (group_name, group_config) in enumerate(group_configs.items()):
        # AutoML metrics output
        metrics_data = PipelineData(
            name=f'metrics_data_{group_name}',
            pipeline_output_name=f'metrics_{group_name}',
            training_output=TrainingOutput(type='Metrics'))
        # AutoML best-model output
        primary_metric = group_config.user_settings['primary_metric']
        model_data = PipelineData(
            name=f'model_data_{group_name}',
            pipeline_output_name=f'best_model_{group_name}',
            training_output=TrainingOutput(type='Model', metric=primary_metric))

        train_step = AutoMLStep(
            name=f'automl_{group_name}',
            automl_config=group_config,
            outputs=[metrics_data, model_data],
            allow_reuse=True)

        # The group name is handed to the register step as a pipeline
        # parameter; it becomes the name of the model for this group.
        group_name_param = PipelineParameter(f"group_name_{index}", default_value=group_name)

        register_step = PythonScriptStep(
            'register.py',
            name=f'register_{group_name}',
            arguments=["--model_name", group_name_param, "--model_path", model_data],
            inputs=[model_data],
            compute_target=compute_target,
            runconfig=register_config,
            source_directory="register",
            allow_reuse=True
        )

        steps += [train_step, register_step]
        model_names.append(group_name)

    if not deploy:
        return steps

    # Merge the existing deploy environment file into a dependency set that
    # lists azureml-defaults and azureml-train-automl, then write it back.
    deploy_deps = CondaDependencies.create(pip_packages=['azureml-defaults', 'azureml-train-automl'])
    existing_deps = CondaDependencies(conda_dependencies_file_path='deploy/myenv.yml')
    deploy_deps._merge_dependencies(existing_deps)
    deploy_deps.save('deploy/myenv.yml')

    # Parameters for the deployment step
    pp_group_column_names = PipelineParameter(
        "group_column_names",
        default_value="#####".join(reversed(group_column_names)))
    pp_model_names = PipelineParameter(
        "model_names",
        default_value=json.dumps(model_names))
    pp_service_name = PipelineParameter(
        "service_name",
        default_value=service_name)

    deployment_step = PythonScriptStep(
        'deploy.py',
        name='service_deploy',
        arguments=["--group_column_names", pp_group_column_names,
                   "--model_names", pp_model_names,
                   "--service_name", pp_service_name,
                   "--time_column_name", time_column_name],
        compute_target=compute_target,
        runconfig=RunConfiguration(),
        source_directory="deploy"
    )
    return StepSequence(steps=[steps, deployment_step])
def get_scoring_pipeline(
    scoring_dataset: Dataset,
    output_loc: PipelineData,
    score_run_config: ParallelRunConfig,
    copy_run_config: RunConfiguration,
    computetarget: ComputeTarget,
    ws: Workspace,
    env: Env,
) -> Pipeline:
    """
    Build the two-step scoring pipeline: a parallel scoring step followed
    by a script step that copies the scores.

    :param scoring_dataset: Data to score
    :param output_loc: Location the scoring step writes to and the copy
    step reads from
    :param score_run_config: Parallel Run configuration for the scoring step
    :param copy_run_config: Run configuration for the copy step
    :param computetarget: AML Compute target for the copy step
    :param ws: AML Workspace
    :param env: Environment Variables

    :returns: Scoring pipeline instance
    """

    def _or_blank(value):
        # Substitute an empty string for a setting that is None.
        return "" if value is None else value

    # The model name, model version and a tag name/value pair are bindable
    # parameters so the model can be selected when the pipeline is invoked
    # over REST or through the AML SDK. Each defaults to a single space.
    scoring_arguments = []
    for filter_name in (
        "model_name",
        "model_version",
        "model_tag_name",
        "model_tag_value",
    ):
        scoring_arguments.append("--" + filter_name)
        scoring_arguments.append(
            PipelineParameter(filter_name, default_value=" ")
        )

    scoring_step = ParallelRunStep(
        name="scoringstep",
        parallel_run_config=score_run_config,
        inputs=[scoring_dataset],
        output=output_loc,
        arguments=scoring_arguments,
        allow_reuse=False,
    )

    copy_arguments = [
        "--output_path",
        output_loc,
        "--scoring_output_filename",
        _or_blank(env.scoring_datastore_output_filename),
        "--scoring_datastore",
        _or_blank(env.scoring_datastore_storage_name),
        "--score_container",
        _or_blank(env.scoring_datastore_output_container),
        "--scoring_datastore_key",
        _or_blank(env.scoring_datastore_access_key),
    ]

    copying_step = PythonScriptStep(
        name="scorecopystep",
        script_name=env.batchscore_copy_script_path,
        source_directory=env.sources_directory_train,
        arguments=copy_arguments,
        inputs=[output_loc],
        compute_target=computetarget,
        runconfig=copy_run_config,
        allow_reuse=False,
    )

    scoring_pipeline = Pipeline(workspace=ws, steps=[scoring_step, copying_step])
    return scoring_pipeline
Exemplo n.º 30
0
    conda_packages=['pandas', 'scikit-learn', 'numpy'],
    pip_packages=[
        'azureml-sdk', 'azureml-dataprep', 'azureml-dataprep[pandas]',
        'azureml-train-automl'
    ],
    pin_sdk_version=False)

scripts_folder = "./scripts"

# Output of the preparation step, stored on the given datastore
prepared_data = PipelineData("diabetes_data_prep", datastore=datastore)

# Step that runs prep_data.py on the diabetes blob data and writes its
# result to prepared_data
prep_data_step = PythonScriptStep(
    name="Prep diabetes data",
    source_directory=scripts_folder,
    script_name="prep_data.py",
    inputs=[blob_diabetes_data],
    outputs=[prepared_data],
    arguments=[
        "--input_file",
        blob_diabetes_data,
        "--output_path",
        prepared_data,
    ],
    runconfig=aml_run_config,
    compute_target=aml_compute,
    allow_reuse=True,
)

print("Preparing the 'split train and data' step")

# Four PipelineData outputs on the same datastore, named for the
# train/test x/y parts of the split
output_split_train_x = PipelineData(
    "diabetes_automl_split_train_x", datastore=datastore)
output_split_train_y = PipelineData(
    "diabetes_automl_split_train_y", datastore=datastore)
output_split_test_x = PipelineData(
    "diabetes_automl_split_test_x", datastore=datastore)
output_split_test_y = PipelineData(
    "diabetes_automl_split_test_y", datastore=datastore)