Example No. 1
def process_step(datastore: Datastore, compute: ComputeTarget,
                 path_on_datastore: str) -> (PipelineData, EstimatorStep):
    datapath = DataPath(datastore=datastore,
                        path_on_datastore=path_on_datastore)
    data_path_pipeline_param = (PipelineParameter(name="data",
                                                  default_value=datapath),
                                DataPathComputeBinding(mode='mount'))

    seer_tfrecords = PipelineData("tfrecords_set",
                                  datastore=datastore,
                                  is_directory=True)

    prep = TensorFlow(source_directory='.',
                      compute_target=compute,
                      entry_script='prep.py',
                      use_gpu=True,
                      pip_requirements_file='requirements.txt')

    prepStep = EstimatorStep(name='Data Preparation',
                             estimator=prep,
                             estimator_entry_script_arguments=[
                                 "--source_path", data_path_pipeline_param,
                                 "--target_path", seer_tfrecords
                             ],
                             inputs=[data_path_pipeline_param],
                             outputs=[seer_tfrecords],
                             compute_target=compute)

    return seer_tfrecords, prepStep
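A minimal sketch of how the step returned above could be assembled into a pipeline and submitted; ws, datastore, compute, the datastore path and the experiment name are illustrative assumptions, not part of the original example.

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# Build the preparation step and wrap it in a pipeline (hypothetical names).
tfrecords, prep_step = process_step(datastore, compute,
                                    path_on_datastore='raw/images')
pipeline = Pipeline(workspace=ws, steps=[prep_step])
pipeline.validate()
run = Experiment(ws, 'seer-prep').submit(pipeline)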
Example No. 2
def hyperparameter_tuning(ws, experiment):
    # Create and submit a Hyperdrive job
    cluster = ws.compute_targets[AML.compute_name]
    script_params={
        '--datastore-dir': ws.get_default_datastore().as_mount(),
    }
    tf_estimator = TensorFlow(source_directory='scripts',
                              compute_target=cluster,
                              entry_script='train.py',
                              script_params=script_params,
                              use_gpu=True)
    ps = RandomParameterSampling(
        {
            '--learning-rate': loguniform(-15, -3)
        }
    )
    early_termination_policy = BanditPolicy(slack_factor=0.15, evaluation_interval=2)
    hyperdrive_run_config = HyperDriveRunConfig(estimator=tf_estimator,
                                                hyperparameter_sampling=ps,
                                                policy=early_termination_policy,
                                                primary_metric_name="validation_accuracy",
                                                primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                                max_total_runs=20,
                                                max_concurrent_runs=4)

    hd_run = experiment.submit(hyperdrive_run_config)
    RunDetails(Run(experiment, hd_run.id)).show()
    return hd_run
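A possible follow-up once the submitted HyperDrive run finishes (a sketch; the workspace, experiment and metric name are taken from the function above):

hd_run = hyperparameter_tuning(ws, experiment)
hd_run.wait_for_completion(show_output=True)

# Pick the child run that scored best on the primary metric.
best_run = hd_run.get_best_run_by_primary_metric()
print(best_run.get_metrics().get('validation_accuracy'))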
Example No. 3
def main(epochs, iterations, compute_target, concurrent_runs):
    cli_auth = AzureCliAuthentication()

    experiment = Experiment.from_directory(".", auth=cli_auth)
    ws = experiment.workspace

    cluster = ws.compute_targets[compute_target]
    food_data = ws.datastores['food_images']

    script_arguments = {"--data-dir": food_data.as_mount(), "--epochs": epochs}

    tf_est = TensorFlow(source_directory=".",
                        entry_script='code/train/train.py',
                        script_params=script_arguments,
                        compute_target=cluster,
                        conda_packages=['pillow', 'pandas'],
                        pip_packages=['click', 'seaborn'],
                        use_docker=True,
                        use_gpu=True,
                        framework_version='1.13')

    # Run on subset of food categories
    tf_est.run_config.arguments.extend(
        ['apple_pie', 'baby_back_ribs', 'baklava', 'beef_carpaccio'])

    param_sampler = RandomParameterSampling({
        '--minibatch-size':
        choice(16, 32, 64),
        '--learning-rate':
        loguniform(-9, -6),
        '--optimizer':
        choice('rmsprop', 'adagrad', 'adam')
    })

    # Create Early Termination Policy
    etpolicy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

    # Create HyperDrive Run Configuration
    hyper_drive_config = HyperDriveConfig(
        estimator=tf_est,
        hyperparameter_sampling=param_sampler,
        policy=etpolicy,
        primary_metric_name='acc',
        primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
        max_total_runs=iterations,
        max_concurrent_runs=concurrent_runs)

    # Submit the Hyperdrive Run
    print("Submitting Hyperdrive Run")
    hd_run = experiment.submit(hyper_drive_config)
    hd_run.wait_for_completion(raise_on_error=True, show_output=True)
    print("Finishing Run")
    best_run = hd_run.get_best_run_by_primary_metric()
    print(f'##vso[task.setvariable variable=run_id]{best_run.id}')
Example No. 4
 def getAMLTensorFlowEstimator(self, compute_target, ws, cfg):
     self._compute_target = compute_target
     self._doctor_script_parameters_for_datastore(ws, cfg)
     return TensorFlow(
         source_directory=self._source_directory,
         compute_target=self._compute_target,
         entry_script=self._entry_script,
         script_params=self._script_params,
         node_count=self._node_count,
         process_count_per_node=self._process_count_per_node,
         distributed_backend=self._distributed_backend,
         use_gpu=self._use_gpu,
         use_docker=self._use_docker,
         pip_packages=self._pip_packages,
         environment_definition=self._environment_definition,
         inputs=self._inputs)
Example No. 5
def train_step(datastore: Datastore, input_data: PipelineData,
               compute: ComputeTarget) -> (PipelineData, EstimatorStep):
    seer_training = PipelineData("train",
                                 datastore=datastore,
                                 is_directory=True)

    train = TensorFlow(source_directory='.',
                       compute_target=compute,
                       entry_script='train.py',
                       use_gpu=True,
                       pip_requirements_file='requirements.txt')

    trainStep = EstimatorStep(name='Model Training',
                              estimator=train,
                              estimator_entry_script_arguments=[
                                  "--source_path", input_data, "--target_path",
                                  seer_training, "--epochs", 15, "--batch", 10,
                                  "--lr", 0.001
                              ],
                              inputs=[input_data],
                              outputs=[seer_training],
                              compute_target=compute)

    return seer_training, trainStep
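A short sketch of chaining the preparation step from Example No. 1 into this training step through the intermediate PipelineData; ws, datastore and compute are assumed to exist as in the earlier examples:

from azureml.pipeline.core import Pipeline

tfrecords, prep_step = process_step(datastore, compute, 'raw/images')
trained, training_step = train_step(datastore, tfrecords, compute)
pipeline = Pipeline(workspace=ws, steps=[prep_step, training_step])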
Example No. 6
        "--extra_path":
        os.path.join(
            f"https/{ab.storage_account}.blob.core.windows.net/pub",
            f"{experiment_id}/",  # Seems a bug in Azure SDK
        ),
        "--experiment_id":
        f"{experiment_id}",
        "--epochs":
        f"{epochs}",
    }

    script_folder = "./"
    estimator = TensorFlow(
        source_directory=script_folder,
        compute_target=compute_target,
        script_params=script_params,
        entry_script="trainer/train_backbone.py",
        framework_version="2.1",
        environment_definition=env,
    )

    run: Run = experiment.submit(estimator)

    run.wait_for_completion(show_output=True)

    es = ExperimentStorage(ws, experiment_id)
    es.download_output(run)

    # Monitoring experiments
    # https://docs.microsoft.com/en-us/azure/machine-learning/how-to-track-experiments
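A small sketch of the kind of monitoring the link above refers to, reusing the run object from this example:

print(run.get_metrics())      # metrics logged by the training script
print(run.get_file_names())   # files written under ./outputs and ./logs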
Example No. 7
def main(args):
    logging.info('Main started.')

    # Define workspace object
    try:
        ws = Workspace.from_config(path='config.json')
    # Need to create the workspace
    except Exception:
        print('No workspace. Check for config.json file.')
        raise
        # ws = Workspace.create(name=os.getenv('WORKSPACE_NAME', ''),
        #             subscription_id=os.getenv('AZURE_SUB', ''),
        #             resource_group=os.getenv('RESOURCE_GROUP', ''),
        #             create_resource_group=True,
        #             location='westus2'))
        # print("Created workspace {} at location {}".format(ws.name, ws.location))

    # choose a name for your cluster - under 16 characters
    cluster_name = "gpuforkeras"

    try:
        compute_target = ComputeTarget(workspace=ws, name=cluster_name)
        print('Found existing compute target.')
    except ComputeTargetException:
        print('Creating a new compute target...')
        # AML Compute config - if max_nodes are set, it becomes persistent storage that scales
        compute_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6', min_nodes=0, max_nodes=5)
        # create the cluster
        compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
        compute_target.wait_for_completion(show_output=True)

    # use get_status() to get a detailed status for the current cluster.
    # print(compute_target.get_status().serialize())

    # # Create a project directory and copy the training script to it
    project_folder = os.path.join(os.getcwd(), 'project')

    # Create an experiment
    experiment_name = args.experiment_name
    experiment = Experiment(ws, name=experiment_name)

    # # Use an AML Data Store for training data
    ds = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name=args.datastore_name,
        container_name=os.getenv('STORAGE_CONTAINER_NAME_TRAINDATA', ''),
        account_name=os.getenv('STORAGE_ACCOUNT_NAME', ''),
        account_key=os.getenv('STORAGE_ACCOUNT_KEY', ''),
        create_if_not_exists=True)

    # Set up for training
    script_params = {
        # --data_path is a Python object that will mount the
        #   datastore to the compute target in next step (linking
        #   to Blob Storage)
        '--data_path': ds.as_mount(),
        '--data_dir': args.data_dir,
        '--gpu_num': args.gpu_num,
        '--class_path': args.class_path,
        '--num_clusters': args.num_clusters,
        '--batch_size': args.batch_size,
        '--learning_rate': args.learning_rate
    }

    # Instantiate TensorFlow estimator to call training script
    estimator = TensorFlow(source_directory=project_folder,
                           script_params=script_params,
                           compute_target=compute_target,
                           entry_script='train_azureml.py',
                           pip_packages=[
                               'keras==2.2.4', 'matplotlib==3.1.1',
                               'opencv-python==4.1.1.26', 'Pillow', 'numpy',
                               'configparser', 'python-dotenv',
                               'tensorflow==1.13.1'
                           ],
                           use_gpu=True,
                           framework_version='1.13')

    # Submit and wait for run to complete - check experiment in Azure Portal for progress
    run = experiment.submit(estimator)
    print(run.get_details())
    run.wait_for_completion(show_output=True)

    # Register models to Workspace
    model = run.register_model(
        model_name='keras-dnn-intermediate',
        model_path='./outputs/trained_weights_intermediate.h5',
        tags={
            'framework': "Keras",
            'task': "object detection"
        },
        description="Custom Keras YOLOv3 model - before fine-tuning phase")
    model = run.register_model(
        model_name='keras-dnn',
        model_path='./outputs/trained_weights_final.h5',
        tags={
            'framework': "Keras",
            'task': "object detection"
        },
        description="Custom Keras YOLOv3 model - final, after fine-tuning phase"
    )
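An optional sketch of retrieving the model registered above at a later point, for example before packaging or deployment; the target directory is an assumption:

from azureml.core.model import Model

model = Model(ws, name='keras-dnn')
model.download(target_dir='./downloaded_model', exist_ok=True)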
Example No. 8
def main():
    load_dotenv()
    workspace_name = os.environ.get("BASE_NAME") + "-AML-WS"
    resource_group = "AML-RG-" + os.environ.get("BASE_NAME")
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    aks_name = os.environ.get("AKS_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")
    experiment_name = os.environ.get("EXPERIMENT_NAME")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(workspace_name, resource_group,
                                  subscription_id, tenant_id, app_id,
                                  app_secret)
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, compute_name, vm_size)
    if aml_compute is not None:
        print(aml_compute)

    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=['numpy', 'pandas', 'scikit-learn', 'keras'],
        pip_packages=[
            'azure', 'azureml-sdk', 'azure-storage', 'azure-storage-blob',
            'transformers>=2.1.1', 'tensorflow>=2.0.0', 'tensorflow-gpu>=2.0.0'
        ]))
    run_config.environment.docker.enabled = True

    datastore_name = 'tfworld'
    container_name = 'azure-service-classifier'
    account_name = 'johndatasets'
    sas_token = '?sv=2019-02-02&ss=bfqt&srt=sco&sp=rl&se=2021-06-02T03:40:25Z&st=2020-03-09T19:40:25Z&spr=https&sig=bUwK7AJUj2c%2Fr90Qf8O1sojF0w6wRFgL2c9zMVCWNPA%3D'

    try:
        existing_datastore = Datastore.get(aml_workspace, datastore_name)
    except:  # noqa: E722
        existing_datastore = Datastore \
            .register_azure_blob_container(workspace=aml_workspace,
                                           datastore_name=datastore_name,
                                           container_name=container_name,
                                           account_name=account_name,
                                           sas_token=sas_token
                                           )

    azure_dataset = Dataset.File.from_files(path=(existing_datastore, 'data'))
    azure_dataset = azure_dataset.register(
        workspace=aml_workspace,
        name='Azure Services Dataset',
        description='Dataset containing azure related posts on Stackoverflow',
        create_new_version=True)

    azure_dataset.to_path()
    input_data = azure_dataset.as_named_input('input_data1').as_mount(
        '/tmp/data')

    model_name = PipelineParameter(name="model_name", default_value=model_name)
    max_seq_length = PipelineParameter(name="max_seq_length",
                                       default_value=128)
    learning_rate = PipelineParameter(name="learning_rate", default_value=3e-5)
    num_epochs = PipelineParameter(name="num_epochs", default_value=3)
    export_dir = PipelineParameter(name="export_dir",
                                   default_value="./outputs/exports")
    batch_size = PipelineParameter(name="batch_size", default_value=32)
    steps_per_epoch = PipelineParameter(name="steps_per_epoch",
                                        default_value=100)

    # initialize the TensorFlow estimator
    estimator = TensorFlow(source_directory=sources_directory_train,
                           entry_script=train_script_path,
                           compute_target=aml_compute,
                           framework_version='2.0',
                           use_gpu=True,
                           pip_packages=[
                               'transformers==2.0.0',
                               'azureml-dataprep[fuse,pandas]==1.3.0'
                           ])

    train_step = EstimatorStep(
        name="Train Model",
        estimator=estimator,
        estimator_entry_script_arguments=[
            "--data_dir", input_data, "--max_seq_length", max_seq_length,
            "--learning_rate", learning_rate, "--num_epochs", num_epochs,
            "--export_dir", export_dir, "--batch_size", batch_size,
            "--steps_per_epoch", steps_per_epoch
        ],
        compute_target=aml_compute,
        inputs=[input_data],
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=evaluate_script_path,
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        arguments=[
            "--model_name",
            model_name,
            "--build_id",
            build_id,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    # Currently, the Evaluate step will automatically register
    # the model if it performs better. This step is based on a
    # previous version of the repo which utilized JSON files to
    # track evaluation results.

    evaluate_step.run_after(train_step)
    steps = [evaluate_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline",
        version=build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')

    response = published_pipeline.submit(  # noqa: F841
        workspace=aml_workspace,
        experiment_name=experiment_name)

    # Get AKS cluster for deployment
    aks_compute = get_aks(aml_workspace, aks_name)
    if aks_compute is not None:
        print(aks_compute)
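As a sketch, the same published pipeline could also be triggered later with overridden pipeline parameters; the values below are purely illustrative:

response = published_pipeline.submit(
    workspace=aml_workspace,
    experiment_name=experiment_name,
    pipeline_parameters={"num_epochs": 5, "batch_size": 64})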
Example No. 9
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

# use get_status() to get a detailed status for the current cluster. 
print(compute_target.get_status().serialize())

script_params = {
    '--data-folder': ws.get_default_datastore().as_mount(),
    '--batch-size': 50,
    '--first-layer-neurons': 300,
    '--second-layer-neurons': 100,
    '--learning-rate': 0.01
}

est = TensorFlow(source_directory=script_folder,
                 script_params=script_params,
                 compute_target=compute_target,
                 entry_script='tf_mnist.py', 
                 use_gpu=True, 
                 framework_version='1.12')

run = exp.submit(est)

run.wait_for_completion(show_output=True, wait_post_processing=True)

# Raise exception if run fails
if run.get_status() == "Failed":
    raise Exception(
        "Training on local failed with following run status: {} and logs: \n {}".format(
            run.get_status(), run.get_details_with_logs()
        )
    )
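A small follow-up sketch: after a successful run, the logged metrics and output files can be pulled back locally (the local directory name is an assumption):

print(run.get_metrics())
run.download_files(prefix='outputs/', output_directory='./tf_mnist_outputs')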
Example No. 10
    # jobs for both single-node and distributed runs.
    # For more information on the TensorFlow estimator, refer
    # https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-train-tensorflow

    script_params = {
        '--image_dir': str(ds.as_download()),
        '--summaries_dir': './logs',
        '--output_graph': './outputs/output_graph.pb',
        '--output_labels': './outputs/output_labels.txt',
        '--saved_model_dir': './outputs/model'
    }

    estimator = TensorFlow(source_directory=project_folder,
                           source_directory_data_store=ds,
                           compute_target=batch_ai_compute,
                           script_params=script_params,
                           entry_script='retrain.py',
                           pip_packages=['tensorflow_hub'],
                           node_count=1,
                           use_gpu=True)

    # Overwrite data store reference
    dr = DataReferenceConfiguration(
        datastore_name=ds.name,
        path_on_datastore='flower_photos',
        mode='download',  # download files from datastore to compute target
        overwrite=True)
    estimator.run_config.data_references[ds.name] = dr

    # Submit Experiment
    print("Training the model...")
    run = experiment.submit(estimator)
Example No. 11
def build_pipeline(dataset, ws, config):
    print("building pipeline for dataset %s in workspace %s" %
          (dataset, ws.name))

    hostname = socket.gethostname()
    if hostname == 'wopauliNC6':
        base_dir = '.'
    else:
        base_dir = '.'

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)

    shutil.copy(os.path.join(base_dir, 'video_decoding.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_submit.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_build.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'train.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'prednet.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'keras_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_preparation.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'model_registration.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'config.json'), script_folder)

    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except ComputeTargetException:
        print("creating new compute target")

        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_D2_V2',
            max_nodes=4,
            idle_seconds_before_scaledown=1800)
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name,
                                                  provisioning_config)
        cpu_compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = config['gpu_compute']

    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except ComputeTargetException:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6',
            max_nodes=5,
            idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name,
                                                  provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(gpu_compute_target.get_status().serialize())

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(conda_packages=["py-opencv=3.4.2"],
                                      pip_packages=[
                                          "azure-storage-blob==1.5.0",
                                          "hickle==3.4.3", "requests==2.21.0",
                                          "sklearn", "pandas==0.24.2",
                                          "azureml-sdk==1.0.21",
                                          "numpy==1.16.2", "pillow==6.0.0"
                                      ])
    gpu_cd = CondaDependencies.create(pip_packages=[
        "keras==2.0.8", "theano==1.0.4", "tensorflow==1.8.0",
        "tensorflow-gpu==1.8.0", "hickle==3.4.3", "matplotlib==3.0.3",
        "seaborn==0.9.0", "requests==2.21.0", "bs4==0.0.1", "imageio==2.5.0",
        "sklearn", "pandas==0.24.2", "azureml-sdk==1.0.21", "numpy==1.16.2"
    ])

    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    gpu_compute_run_config = RunConfiguration(conda_dependencies=gpu_cd)
    gpu_compute_run_config.environment.docker.enabled = True
    gpu_compute_run_config.environment.docker.gpu_support = True
    gpu_compute_run_config.environment.docker.base_image = DEFAULT_GPU_IMAGE
    gpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    video_data = DataReference(datastore=def_blob_store,
                               data_reference_name="video_data",
                               path_on_datastore=os.path.join(
                                   "prednet", "data", "video", dataset))

    # Naming the intermediate data as processed_data1 and assigning it to the variable processed_data1.
    raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames",
                                     datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    print("DataReference object created")

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py",
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.'])
    print("video_decode created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(name='prepare_data',
                                 script_name="data_preparation.py",
                                 arguments=[
                                     "--input_data", raw_data, "--output_data",
                                     preprocessed_data
                                 ],
                                 inputs=[raw_data],
                                 outputs=[preprocessed_data],
                                 compute_target=cpu_compute_target,
                                 source_directory=script_folder,
                                 runconfig=cpu_compute_run_config,
                                 allow_reuse=True,
                                 hash_paths=['.'])
    data_prep.run_after(video_decoding)

    print("data_prep created")

    est = TensorFlow(source_directory=script_folder,
                     compute_target=gpu_compute_target,
                     pip_packages=[
                         'keras==2.0.8', 'theano', 'tensorflow==1.8.0',
                         'tensorflow-gpu==1.8.0', 'matplotlib', 'horovod',
                         'hickle'
                     ],
                     entry_script='train.py',
                     use_gpu=True,
                     node_count=1)

    ps = RandomParameterSampling({
        '--batch_size':
        choice(2, 4, 8, 16),
        '--filter_sizes':
        choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        '--stack_sizes':
        choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),  #, "48, 96"),
        '--learning_rate':
        loguniform(-6, -1),
        '--lr_decay':
        loguniform(-9, -1),
        '--freeze_layers':
        choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "1", "2",
               "3"),
        '--transfer_learning':
        choice("True", "False")
    })

    policy = BanditPolicy(evaluation_interval=2,
                          slack_factor=0.1,
                          delay_evaluation=20)

    hdc = HyperDriveRunConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        policy=policy,
        primary_metric_name='val_loss',
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=5,  #100,
        max_concurrent_runs=5,  #10,
        max_duration_minutes=60 * 6)

    hd_step = HyperDriveStep(name="train_w_hyperdrive",
                             hyperdrive_run_config=hdc,
                             estimator_entry_script_arguments=[
                                 '--data-folder', preprocessed_data,
                                 '--remote_execution'
                             ],
                             inputs=[preprocessed_data],
                             metrics_output=data_metrics,
                             allow_reuse=True)
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=gpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.'])
    registration_step.run_after(hd_step)

    pipeline = Pipeline(
        workspace=ws,
        steps=[video_decoding, data_prep, hd_step, registration_step])
    print("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete")

    pipeline_name = 'prednet_' + dataset
    pipeline.publish(name=pipeline_name)

    return pipeline_name
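A sketch of how the published pipeline could be fetched by name and triggered afterwards; the dataset name, config object and experiment name are assumptions:

from azureml.pipeline.core import PublishedPipeline

pipeline_name = build_pipeline('UCSDped1', ws, config)
published = next(p for p in PublishedPipeline.list(ws) if p.name == pipeline_name)
published.submit(ws, experiment_name='prednet_retraining')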
Example No. 12
        autoscale_enabled=True,
        min_nodes=0,
        max_nodes=4)
    batch_ai_compute = AmlCompute.create(
        ws,
        name=compute_target_name,
        provisioning_configuration=batch_ai_config)
    batch_ai_compute.wait_for_completion(show_output=True)

# Submit run using TensorFlow estimator
from azureml.train.dnn import TensorFlow

script_params = {"--log_dir": "./logs"}

tf_estimator = TensorFlow(source_directory='./scripts',
                          compute_target=batch_ai_compute,
                          entry_script='mnist_with_summaries.py',
                          script_params=script_params)

run = exp.submit(tf_estimator)

runs.append(run)
run.wait_for_completion(show_output=True)

from azureml.contrib.tensorboard import Tensorboard

# The TensorBoard constructor takes an array of runs...
# and it turns out that we have been building one of those all along.
tb = Tensorboard(runs)

# If successful, start() returns a string with the URI of the instance.
tb.start()
Example No. 13
# Run distributed training with Horovod using the built-in ```azureml.train.dnn.TensorFlow``` estimator.
# If you want to customize more detailed settings (other frameworks, custom images, etc.), please use the base ```azureml.train.estimator.Estimator``` (parent class).
# 
# ** Note: This estimator (```azureml.train.dnn.TensorFlow```) is an estimator in the AML SDK, and is not the same as ```tf.estimator.Estimator``` in TensorFlow. Do not be confused by the terminology "estimator". **

#%%
from azureml.train.dnn import TensorFlow

script_params={
    '--data_folder': ds_data
}
estimator = TensorFlow(
    source_directory='./script',
    compute_target=compute_target,
    script_params=script_params,
    entry_script='train_horovod.py',
    node_count=2,
    process_count_per_node=1,
    distributed_backend='mpi',
    use_gpu=False)

#%% [markdown]
# ### Step 5 : Run script and wait for completion

#%%
from azureml.core import Experiment

exp = Experiment(workspace=ws, name='tf_distributed')
run = exp.submit(estimator)
run.wait_for_completion(show_output=True)
Example No. 14
    print(f.read())

from azureml.train.dnn import TensorFlow

script_params = {
    '--data-folder': dataset.as_named_input('mnist').as_mount(),
    '--batch-size': 64,
    '--first-layer-neurons': 256,
    '--second-layer-neurons': 128,
    '--learning-rate': 0.01
}

est = TensorFlow(source_directory=script_folder,
                 script_params=script_params,
                 compute_target=compute_target,
                 entry_script='tf_mnist2.py',
                 use_gpu=True,
                 framework_version='2.0',
                 pip_packages=['azureml-dataprep[pandas,fuse]'])

run = exp.submit(est)

#from azureml.widgets import RunDetails

#RunDetails(run).show()

run.wait_for_completion(show_output=True)

run.get_details()

run.get_metrics()
Example No. 15
from azureml.core import Experiment, RunConfiguration, ScriptRunConfig, Workspace, Environment, Model
from azureml.train.dnn import TensorFlow
from azureml.core.conda_dependencies import CondaDependencies

ws = Workspace.from_config()

environment = Environment.from_conda_specification(name="sentiment-env", file_path="experiment-env.yml")
environment.register(ws)
# environment = Environment.get(ws, "sentiment-env")

estimator = TensorFlow(
    source_directory="imdb", 
    entry_script="experiment.py", 
    compute_target="local", 
    framework_version="2.1",  
    script_params={'--n-words': 80000, '--epochs': 2},
    environment_definition=environment
    )

experiment = Experiment(workspace=ws, name="sentiment-analysis")
run = experiment.submit(config=estimator)

run.wait_for_completion(show_output=True)

run.register_model( model_name='sentiment_model',
                    model_path=f'outputs/sentiment_model.h5',
                    description='A sentiment analysis model from imdb data',
                    tags={'source': 'imdb'},
                    model_framework=Model.Framework.TENSORFLOW,
                    model_framework_version='2.2.0',
                    properties={'Accuracy': run.get_metrics()['accuracy']})
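A brief sketch of retrieving the registered model later, for example from a deployment script; the download directory is an assumption:

from azureml.core.model import Model

model = Model(ws, name='sentiment_model')
print(model.version, model.properties.get('Accuracy'))
model.download(target_dir='./sentiment_model', exist_ok=True)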
Example No. 16
# the training logic is in the train.py file.
shutil.copy('./train.py', script_folder)
shutil.copy('./data_utils.py', script_folder)
shutil.copy('./prednet.py', script_folder)
shutil.copy('./keras_utils.py', script_folder)

script_params = {
    '--data-folder': ds.path('prednet').as_mount(),
    '--compute_target': cluster_name
}

est = TensorFlow(source_directory=script_folder,
                 script_params=script_params,
                 compute_target=compute_target,
                 pip_packages=[
                     'keras==2.0.8', 'theano', 'tensorflow==1.8.0',
                     'tensorflow-gpu==1.8.0', 'matplotlib', 'horovod', 'hickle'
                 ],
                 entry_script='train.py',
                 use_gpu=True,
                 node_count=1)

# run = exp.submit(est)

# print(run)

# run.wait_for_completion(show_output=True)

ps = RandomParameterSampling({
    '--batch_size':
    choice(2, 4, 8, 16),
    '--filter_sizes':
Example No. 17
def main():
    load_dotenv()
    workspace_name = os.environ.get("BASE_NAME") + "-AML-WS"
    resource_group = os.environ.get("BASE_NAME") + "-AML-RG"
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")
    data_path = os.environ.get("DATA_PATH_DATASTORE")
    model_data_path = os.environ.get("MODEL_DATA_PATH_DATASTORE")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(workspace_name, resource_group,
                                  subscription_id, tenant_id, app_id,
                                  app_secret)
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, compute_name, vm_size)
    if aml_compute is not None:
        print(aml_compute)

    model_name = PipelineParameter(name="model_name", default_value=model_name)
    release_id = PipelineParameter(name="release_id", default_value="0")

    ds = aml_workspace.get_default_datastore()

    dataref_folder = ds.path(data_path).as_mount()
    model_dataref = ds.path(model_data_path).as_mount()

    # NEED those two folders mounted on datastore and env variables specified in variable groups

    #ds.upload(src_dir='./VOCdevkit', target_path='VOCdevkit', overwrite=True, show_progress=True)
    #ds.upload(src_dir='./model_data', target_path='VOCmodel_data', overwrite=True, show_progress=True)

    yoloEstimator = TensorFlow(
        source_directory=sources_directory_train + '/training',
        compute_target=aml_compute,
        entry_script=train_script_path,
        pip_packages=[
            'keras', 'pillow', 'matplotlib', 'onnxmltools', 'keras2onnx==1.5.1'
        ],  # recent versions of keras2onnx give conversion issues 
        use_gpu=True,
        framework_version='1.13')

    train_step = EstimatorStep(name="Train & Convert Model",
                               estimator=yoloEstimator,
                               estimator_entry_script_arguments=[
                                   "--release_id", release_id, "--model_name",
                                   model_name, "--data_folder", dataref_folder,
                                   "--model_path", model_dataref
                               ],
                               runconfig_pipeline_params=None,
                               inputs=[dataref_folder, model_dataref],
                               compute_target=aml_compute,
                               allow_reuse=False)
    print("Step Train & Convert created")

    train_pipeline = Pipeline(workspace=aml_workspace, steps=[train_step])
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline",
        version=build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Example No. 18
# that you will need access to on the remote resource. This includes the training script, 
# and any additional files your training script depends on.
import os

project_folder = './tmp/tf-distr-ps'
os.makedirs(project_folder, exist_ok=True)

import shutil
shutil.copy('./scripts/tf_mnist_replica.py', project_folder)

from azureml.train.dnn import TensorFlow

script_params={
    '--num_gpus': 1
}

estimator = TensorFlow(source_directory=project_folder,
                       compute_target=batch_ai_compute,
                       script_params=script_params,
                       entry_script='tf_mnist_replica.py',
                       node_count=2,
                       worker_count=2,
                       parameter_server_count=1,   
                       distributed_backend='ps',
                       use_gpu=True)

run = experiment.submit(estimator)
print(run.get_details())

run.wait_for_completion(show_output=True)
Example No. 19
    compute_target.wait_for_completion(show_output=True)

# Use the 'status' property to get a detailed status for the current AmlCompute.
print(compute_target.status.serialize())

# ### Create the Keras Estimator

# In[ ]:

from azureml.train.dnn import TensorFlow

keras_est = TensorFlow(
    source_directory=project_folder,
    compute_target=compute_target,
    entry_script='train.py',
    conda_packages=['pandas'],
    pip_packages=['keras==2.2.4'],  # just add keras through pip
    use_gpu=True)

# ## Remotely train a deep learning model using Azure ML Compute
# In the following cells, you will *not* train the model against the data you just downloaded using the resources provided by Azure Notebooks. Instead, you will deploy an Azure ML Compute cluster that will download the data and use a training script to train the model. In other words, all of the training will be performed remotely with respect to this notebook.
#

# In[ ]:

# create project folder
if not os.path.exists(project_folder):
    os.makedirs(project_folder)

# ### Create the training script
Example No. 20
voc = Dataset.get_by_name(workspace=ws, name="voc-classification")

# Create estimator
script_params = {
    "--h5_path": voc.as_named_input("voc_classification").as_download(),
    "--outputs_path": "classification",
    "--logs_path": "logs"
}
print("Script parameters:", script_params)

est = TensorFlow(source_directory=script_folder,
                 compute_target=target,
                 entry_script="train.py",
                 use_gpu=True,
                 script_params=script_params,
                 framework_version="2.0",
                 conda_packages=["scikit-learn"],
                 pip_packages=[
                     "azureml-defaults", "matplotlib", "progressbar2",
                     "Pillow", "h5py"
                 ])

# Run experiment
try:
    run = exp.submit(est)
    run.wait_for_completion(show_output=True)
except Exception as e:
    print(e)
    print("Experiment failed")

# Remove folder after use (after all we only copy a few python scripts, not huge data)
Example No. 21
        validtarget_dataset.as_named_input('validtarget_dataset'),
        testtarget_dataset.as_named_input('testtarget_dataset')
    ],
    arguments=['--PreProcessingData', PreProcessingData],
    outputs=[PreProcessingData],
    allow_reuse=True)

print("preprocessing_step")

#######################################################################################################

est = TensorFlow(source_directory='./scripts/train',
                 compute_target=GPU_compute_target,
                 entry_script="estimator_training.py",
                 pip_packages=[
                     'keras<=2.3.1', 'matplotlib', 'opencv-python',
                     'azure-storage-blob==2.1.0', 'tensorflow-gpu==2.0.0'
                 ],
                 conda_packages=['scikit-learn==0.22.1'],
                 use_gpu=True)

est_step = EstimatorStep(name="Estimator_Train",
                         estimator=est,
                         estimator_entry_script_arguments=[
                             '--PreProcessingData', PreProcessingData
                         ],
                         inputs=[PreProcessingData],
                         runconfig_pipeline_params=None,
                         compute_target=GPU_compute_target)

#######################################################################################################
Example No. 22
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

# use get_status() to get a detailed status for the current cluster. 

# the training logic is in the train.py file.
shutil.copy('./train.py', script_folder)

# utils.py just helps load data from the downloaded MNIST dataset into numpy arrays.
shutil.copy('./utils.py', script_folder)

script_params = {
    '--data-folder': ds.path('mnist').as_mount(),
    '--batch-size': 64,
    '--learning-rate': 0.001
}

estimator = TensorFlow(source_directory=script_folder,
                       script_params=script_params,
                       compute_target=compute_target,
                       pip_packages=['keras', 'matplotlib', 'sklearn', 'onnxmltools'],
                       entry_script='train.py',
                       use_gpu=False)


run = exp.submit(estimator)





Example No. 23
extractDataStep = PythonScriptStep(
    script_name="extract.py",
    arguments=["--output_extract", processed_mnist_data],
    outputs=[processed_mnist_data],
    compute_target=compute_target_cpu,
    source_directory=source_directory,
    runconfig=run_config)

print("Data Extraction Step created")

from azureml.train.dnn import TensorFlow

source_directory = 'Training'
est = TensorFlow(source_directory=source_directory,
                 compute_target=compute_target_cpu,
                 entry_script='train.py',
                 use_gpu=False,
                 framework_version='1.13')

from azureml.pipeline.steps import EstimatorStep

trainingStep = EstimatorStep(name="Training-Step",
                             estimator=est,
                             estimator_entry_script_arguments=[
                                 "--input_data_location", processed_mnist_data,
                                 '--batch-size', 50, '--first-layer-neurons',
                                 300, '--second-layer-neurons', 100,
                                 '--learning-rate', 0.01, "--release_id", 0,
                                 '--model_name', model_name
                             ],
                             runconfig_pipeline_params=None,
Example No. 24
        name=batchai_cluster_name,
        provisioning_configuration=provisioning_config)
    compute_target.wait_for_provisioning(show_output=True)

print('create TensorFlow estimator.')
from azureml.train.dnn import TensorFlow
script_params = {
    '--batch-size': 50,
    '--first-layer-neurons': 300,
    '--second-layer-neurons': 100,
    '--learning-rate': 0.01
}

tfe = TensorFlow(project=project,
                 script_params=script_params,
                 compute_target=compute_target,
                 entry_script='mnist_tf.py',
                 use_gpu=True,
                 conda_packages=['scikit-learn'])

print()
print('##################################################')
print('submitting {} for a Batch AI run...'.format(train_script))
print('##################################################')
print()

# start the job
run = tfe.fit()
print(helpers.get_run_history_url(run))
run.wait_for_completion(show_output=True)

print('configure hyperdrive.')
Example No. 25
    enable_optimized_mode = experiment_settings["framework"]["tensorflow"][
        "_enable_optimized_mode"]

    estimator = TensorFlow(
        source_directory=experiment_settings["source_directory"],
        compute_target=compute_target,
        entry_script=experiment_settings["entry_script"],
        script_params=experiment_settings["script_parameters"],
        node_count=experiment_settings["distributed_training"]["node_count"],
        distributed_training=distrib_training_backend,
        use_docker=experiment_settings["docker"]["use_docker"],
        custom_docker_image=experiment_settings["docker"]["custom_image"],
        image_registry_details=container_registry,
        user_managed=experiment_settings["user_managed"],
        conda_packages=experiment_settings["dependencies"]["conda_packages"],
        pip_packages=experiment_settings["dependencies"]["pip_packages"],
        conda_dependencies_file=experiment_settings["dependencies"]
        ["conda_dependencies_file"],
        pip_requirements_file=experiment_settings["dependencies"]
        ["pip_requirements_file"],
        environment_variables=experiment_settings["environment_variables"],
        inputs=experiment_settings["data_references"],
        source_directory_data_store=experiment_settings[
            "source_directory_datastore"],
        shm_size=experiment_settings["docker"]["shm_size"],
        max_run_duration_seconds=experiment_settings[
            "max_run_duration_seconds"],
        framework_version=framework_version,
        _enable_optimized_mode=enable_optimized_mode)

elif experiment_settings["framework"]["name"] == "sklearn":
    framework_version = experiment_settings["framework"]["sklearn"][
Example No. 26
    'loss': 'cross-entropy',
    'network': 'resnet50',
    'batch_size': '4',
    'learning_rate': '0.0001'
})
""" RUN EXPERIMENT AS A SCRIPT ON SOME COMPUTE INSTANCE """

compute_target = ComputeTarget(workspace=workspace,
                               name='compute_instance_name')

estimator = TensorFlow(source_directory='.',
                       script_params=None,
                       entry_script='./scripts/train_network.py',
                       pip_packages=[
                           'numpy', 'opencv-python==4.2.0.34', 'pandas',
                           'scikit-image', 'addict',
                           'git+https://github.com/tensorflow/examples.git',
                           'segmentation-models', 'albumentations'
                       ],
                       compute_target=compute_target,
                       use_gpu=True,
                       framework_version='2.1')

experiment = Experiment(workspace=workspace, name='experiment_name')

run = experiment.submit(estimator)

# get details about experiment run
RunDetails(run).show()
""" DATA MOUNTING """

root = os.getcwd()
Example No. 27
    '--first_layer':
    choice(100, 125, 150),
    '--second_layer':
    choice(30, 60, 90)
})

#%% [markdown]
# ## Generate estimator

#%%
from azureml.train.dnn import TensorFlow

script_params = {'--data_folder': ds_data}
estimator = TensorFlow(source_directory='./script',
                       compute_target=compute_target,
                       script_params=script_params,
                       entry_script='train_experiment.py',
                       use_gpu=False)

#%% [markdown]
# ## Generate run config
#
# Generate run config with an early termination policy (```BanditPolicy```). With this policy, the training will terminate if the primary metric falls outside of the top 10% range (checking every 2 iterations).

#%%
# early termination:
# primary metric falls outside of the top 10% (0.1) range by checking every 2 iterations
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)
# generate run config
run_config = HyperDriveRunConfig(
    estimator=estimator,
Example No. 28
    def _start_estimator_training(self, training_name: str, estimator_type: str = None, input_datasets: np.array = None, input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, script_parameters: dict = None, show_widget: bool = True, **kwargs):
        '''
        Will start a new training using an Estimator, taking the training name as the folder of the run
        Args:
            training_name (str): The name of a training. This will be used to create a directory. Can contain subdirectories
            estimator_type (str): one of these values (tensorflow, sklearn, pytorch)
            input_datasets (np.array): An array of dataset names that will be mounted on the compute in a directory of the dataset name
            input_datasets_to_download (np.array): An array of dataset names that will be downloaded to the compute in a directory of the dataset name
            compute_target (str): The compute target (default = 'local') on which the training should be executed
            gpu_compute (bool): Indicates if GPU compute is required for this script or not
            script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script
            show_widget (bool): Will display the live tracking of the submitted Run
        '''
        from azureml.train.estimator import Estimator

        # Check if directory exists
        if not(os.path.exists(training_name) and os.path.isdir(training_name)):
            raise FileNotFoundError(training_name)

        # Check compute target
        if compute_target != 'local':
            self.__check_compute_target(compute_target, gpu_compute)
            

        # Add datasets
        datasets = list()
        if(input_datasets is not None):
            for ds in input_datasets:
                datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_mount(path_on_compute=ds))
        if(input_datasets_to_download is not None):
            for ds in input_datasets_to_download:
                datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_download(path_on_compute=ds))

        # as mount - as download
        constructor_parameters = {
            'source_directory':training_name,
            'script_params':script_parameters,
            'inputs':datasets,
            'compute_target':compute_target,
            'entry_script':'train.py',
            'pip_requirements_file':'requirements.txt', 
            'use_gpu':gpu_compute,
            'use_docker':True}
        
        print('Creating estimator of type', estimator_type)

        if(estimator_type is None):
            # Using default Estimator
            estimator = Estimator(**constructor_parameters)
        elif(estimator_type == 'tensorflow'):
            from azureml.train.dnn import TensorFlow
            version_par = 'framework_version'
            if(not version_par in constructor_parameters.keys()):
                print('Defaulting to version 2.0 for TensorFlow')
                constructor_parameters[version_par] = '2.0'
            estimator = TensorFlow(**constructor_parameters)
        elif(estimator_type == 'sklearn'):
            from azureml.train.sklearn import SKLearn
            estimator = SKLearn(**constructor_parameters)
        elif(estimator_type == 'pytorch'):
            from azureml.train.dnn import PyTorch
            estimator = PyTorch(**constructor_parameters)

        # Submit training
        self.__current_run = self.__experiment.submit(estimator)
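A hypothetical usage of the helper above; the trainer instance, folder name, dataset names and compute target are all assumptions made for illustration:

trainer._start_estimator_training('trainings/sentiment',
                                  estimator_type='tensorflow',
                                  input_datasets=['tweets'],
                                  compute_target='gpu-cluster',
                                  gpu_compute=True,
                                  script_parameters={'--epochs': 5})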
Example No. 29
try:
    cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print("cluster exist: ", cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size="standard_d12_v2", max_nodes=1)
    cluster = ComputeTarget.create(ws, cluster_name, compute_config)
cluster.wait_for_completion(show_output=True)

exp_name = "exp_bearing_anomaly_lstm"
experiment = Experiment(ws, name=exp_name)

estimator = TensorFlow(
        source_directory='.', 
        entry_script='lstm.py', 
        script_params={'--run_at': 'remote'},
        inputs=[dataset.as_named_input('bearingdata')],
        compute_target=cluster, 
        framework_version='2.0', 
        pip_packages=['scikit-learn==0.22.1', 'seaborn==0.10.1']
        )
run = experiment.submit(estimator)

run.wait_for_completion(show_output=True)
assert(run.get_status() == 'Completed')
print(run.get_file_names())
model = run.register_model(
    model_name='anomaly_detect_lstm_ae', 
    model_path='./outputs/model', 
    description='LSTM AE for anomaly detection', 
    model_framework='Keras', 
    model_framework_version='2.3.1'
Example No. 30
os.makedirs(project_folder, exist_ok=True)

import shutil
shutil.copy('./scripts/train_Fashion_MNIST.py', project_folder)

# Create a TensorFlow estimator
# The AML SDK's TensorFlow estimator enables you to easily submit TensorFlow training
# jobs for both single-node and distributed runs.
# For more information on the TensorFlow estimator, refer
# https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-train-tensorflow
from azureml.train.dnn import TensorFlow

estimator = TensorFlow(source_directory=project_folder,
                       compute_target=batch_ai_compute,
                       entry_script='train_Fashion_MNIST.py',
                       node_count=1,
                       worker_count=1,
                       parameter_server_count=1,
                       conda_packages=['keras', 'matplotlib'],
                       use_gpu=True)

# Submit Experiment
run = experiment.submit(estimator)
run.tag("Description", "Batch AI trained Fashion MNIST model")
run.wait_for_completion(show_output=True)

# Show Metrics
# get all metrics logged in the run
run.get_metrics()
metrics = run.get_metrics()

import numpy as np