def main():
    """Publish the Azure ML pipeline that deploys the production model.

    All settings are read from environment variables after ``load_dotenv()``
    has run. The function resolves the workspace and the compute cluster,
    builds a one-step pipeline around the deployment script, validates it
    and publishes it under the configured name and build id.
    """
    load_dotenv()
    env = os.environ.get

    # Plain concatenation: raises TypeError when BASE_NAME is not set.
    ws_name = env("BASE_NAME") + "-AML-WS"
    ws_credentials = (
        env("RESOURCE_GROUP"),
        env("SUBSCRIPTION_ID"),
        env("TENANT_ID"),
        env("SP_APP_ID"),
        env("SP_APP_SECRET"),
    )
    script_path = env("DEPLOY_PROD_SCRIPT_PATH")
    cluster_sku = env("AML_COMPUTE_CLUSTER_CPU_SKU")
    cluster_name = env("AML_COMPUTE_CLUSTER_NAME")
    default_model_name = env("MODEL_NAME")
    build = env("BUILD_BUILDID")
    published_name = env("DEPLOY_PROD_PIPELINE_NAME")
    default_service_name = env("DEPLOY_PROD_SERVICE_NAME")
    source_dir = env("SOURCES_DIR_TRAIN")

    # Get Azure machine learning workspace
    workspace = get_workspace(ws_name, *ws_credentials)
    print(workspace)

    # Get Azure machine learning cluster
    cluster = get_compute(workspace, cluster_name, cluster_sku)
    if cluster is not None:
        print(cluster)

    # Python environment used by the deployment step.
    conda_pkgs = ['numpy', 'pandas', 'scikit-learn']
    pip_pkgs = [
        'azureml-core==1.0.72.*',
        'azureml-sdk==1.0.72.*',
        'azure-storage',
        'azure-storage-blob',
        'azureml-dataprep',
        'azureml-datadrift==1.0.72.*',
    ]
    deps = CondaDependencies.create(
        conda_packages=conda_pkgs,
        pip_packages=pip_pkgs,
        pin_sdk_version=False,
    )
    print(deps.serialize_to_string())

    step_run_config = RunConfiguration(
        framework='Python',
        conda_dependencies=deps,
    )
    step_run_config.environment.docker.enabled = True

    # Pipeline parameters, created and echoed one at a time.
    params = {}
    for key, default in (
        ("model_name", default_model_name),
        ("release_id", "0"),
        ("service_name", default_service_name),
    ):
        params[key] = PipelineParameter(name=key, default_value=default)
        print(params[key])

    step = PythonScriptStep(
        name="Deploy Prod Model",
        script_name=script_path,
        compute_target=cluster,
        source_directory=source_dir,
        arguments=[
            "--release_id", params["release_id"],
            "--model_name", params["model_name"],
            "--service_name", params["service_name"],
        ],
        runconfig=step_run_config,
        allow_reuse=False,
    )
    print("Step Deploy Prod created")

    pipeline = Pipeline(workspace=workspace, steps=[step])
    pipeline.validate()
    published = pipeline.publish(
        name=published_name,
        description="Model deploy Prod pipeline",
        version=build,
    )
    print(f'Published pipeline: {published.name}')
    print(f'for build {published.version}')
script_name='model_registration.py', arguments=['--input_dir', data_metrics, '--output_dir', data_output], compute_target=gpu_compute_target, inputs=[data_metrics], outputs=[data_output], source_directory=script_folder, runconfig=gpu_compute_run_config, allow_reuse=True, hash_paths=['.']) registration_step.run_after(hd_step) pipeline = Pipeline( workspace=ws, steps=[get_logits_from_xception, hd_step, registration_step]) print("Pipeline is built") pipeline.validate() print("Simple validation complete") pipeline_name = 'kd_teach_the_student' # We need to disable (delete) previously published pipelines, because we can't have two published pipelines with the same name from utils.azure import disable_pipeline disable_pipeline(pipeline_name=pipeline_name, prefix='', dry_run=False) published_pipeline = pipeline.publish(name=pipeline_name) print("Student pipeline published") schedule = Schedule.create(workspace=ws, name=pipeline_name + "_sch", pipeline_id=published_pipeline.id, experiment_name=pipeline_name,
def build_pipeline(dataset, ws, config):
    """Build, validate, publish and schedule the prednet pipeline.

    The pipeline has four steps that run in sequence: video decoding, data
    preparation, HyperDrive training and model registration. After
    publishing, a datastore-triggered schedule is created for it.

    Args:
        dataset: Name of the dataset. It is used as a sub-folder of the
            datastore paths, forwarded to the training script and appended
            to the pipeline name.
        ws: Azure ML workspace the pipeline is created in.
        config: Mapping providing the keys ``cpu_compute``, ``gpu_compute``,
            ``acr_address``, ``acr_username`` and ``acr_password``.

    Returns:
        The name of the published pipeline (``'prednet_' + dataset``).
    """
    print("building pipeline for dataset %s in workspace %s"
          % (dataset, ws.name))

    base_dir = '.'
    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)
    files_to_upload = (
        'video_decoding.py',
        'pipelines_submit.py',
        'pipelines_create.py',
        'train.py',
        'data_utils.py',
        'prednet.py',
        'keras_utils.py',
        'data_preparation.py',
        'model_registration.py',
        'config.json',
    )
    for file_name in files_to_upload:
        shutil.copy(os.path.join(base_dir, file_name), script_folder)

    # CPU cluster: reuse it when it exists, otherwise provision it.
    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except Exception:
        # Any lookup failure is treated as "cluster does not exist".
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed.
        print("creating new compute target")
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_D2_V2',
            max_nodes=4,
            idle_seconds_before_scaledown=1800)
        cpu_compute_target = ComputeTarget.create(
            ws, cpu_compute_name, provisioning_config)
        cpu_compute_target.wait_for_completion(
            show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # GPU cluster: same reuse-or-create logic.
    gpu_compute_name = config['gpu_compute']
    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except Exception:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6',
            max_nodes=10,
            idle_seconds_before_scaledown=1800)
        # create the cluster
        gpu_compute_target = ComputeTarget.create(
            ws, gpu_compute_name, provisioning_config)
        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it uses the scale settings for
        # the cluster
        gpu_compute_target.wait_for_completion(
            show_output=True, min_node_count=None, timeout_in_minutes=20)

    # Printing the GPU cluster status is best effort only.
    try:
        print(gpu_compute_target.get_status().serialize())
    except Exception as e:
        print("Could not get status of compute target.")
        print(e)

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(
        conda_packages=["py-opencv=3.4.2"],
        pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D',
        pip_packages=[
            "azure-storage-blob==1.5.0",
            "hickle==3.4.3",
            "requests==2.21.0",
            "sklearn",
            "pandas==0.24.2",
            "azureml-sdk",
            "numpy==1.16.2",
            "pillow==6.0.0",
        ])

    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    # DataReference to where video data is stored.
    video_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="video_data",
        path_on_datastore=os.path.join("prednet", "data", "video", dataset))
    print("DataReference object created")

    # Intermediate data passed between the pipeline steps.
    raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames",
                                     datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py",
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    print("video_decode step created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name='prepare_data',
        script_name="data_preparation.py",
        arguments=["--input_data", raw_data,
                   "--output_data", preprocessed_data],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    data_prep.run_after(video_decoding)
    print("data_prep step created")

    # configure access to ACR for pulling our custom docker image
    acr = ContainerRegistry()
    acr.address = config['acr_address']
    acr.username = config['acr_username']
    acr.password = config['acr_password']

    est = Estimator(
        source_directory=script_folder,
        compute_target=gpu_compute_target,
        entry_script='train.py',
        use_gpu=True,
        node_count=1,
        custom_docker_image="wopauli_1.8-gpu:1",
        image_registry_details=acr,
        user_managed=True
    )

    # Hyperparameter search space, sampled at random.
    ps = RandomParameterSampling(
        {
            '--batch_size': choice(1, 2, 4, 8),
            '--filter_sizes': choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
            '--stack_sizes': choice("48, 96, 192", "36, 72, 144",
                                    "12, 24, 48"),
            '--learning_rate': loguniform(-6, -1),
            '--lr_decay': loguniform(-9, -1),
            '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2",
                                      "2, 3", "0", "3"),
            '--transfer_learning': choice("True", "False")
        }
    )

    policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1,
                          delay_evaluation=10)

    hdc = HyperDriveConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        policy=policy,
        primary_metric_name='val_loss',
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=10,
        max_concurrent_runs=5,
        max_duration_minutes=60*6
    )

    hd_step = HyperDriveStep(
        name="train_w_hyperdrive",
        hyperdrive_run_config=hdc,
        estimator_entry_script_arguments=[
            '--data-folder', preprocessed_data,
            '--remote_execution',
            '--dataset', dataset
        ],
        inputs=[preprocessed_data],
        metrics_output=data_metrics,
        allow_reuse=True
    )
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=cpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.']
    )
    registration_step.run_after(hd_step)

    pipeline = Pipeline(
        workspace=ws,
        steps=[video_decoding, data_prep, hd_step, registration_step])
    print("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete")

    pipeline_name = 'prednet_' + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    # Schedule that watches the dataset's 'Train' folder on the default
    # datastore. The returned Schedule object is not needed afterwards.
    Schedule.create(
        workspace=ws,
        name=pipeline_name + "_sch",
        pipeline_id=published_pipeline.id,
        experiment_name=pipeline_name,
        datastore=def_blob_store,
        wait_for_provisioning=True,
        description="Datastore scheduler for Pipeline" + pipeline_name,
        path_on_datastore=os.path.join('prednet/data/video', dataset,
                                       'Train'),
        polling_interval=1
    )

    return pipeline_name
def main():
    """Publish the Azure ML pipeline that preprocesses data with OS commands.

    Configuration comes from an ``Env`` instance. The function resolves the
    workspace, compute cluster and a reusable environment, builds a
    single-step pipeline around ``preprocess/preprocess_os_cmd_aml.py``,
    validates it and publishes it under the configured name and build id.
    """
    e = Env()

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print(f"get_workspace:{aml_workspace}")

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print(f"aml_compute:{aml_compute}")

    # Create a reusable Azure ML environment
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        create_new=e.rebuild_env,
        enable_docker=True,
        dockerfile='ml_model/preprocess/Dockerfile'
    )

    run_config = RunConfiguration()
    run_config.environment = environment

    # Use the configured datastore, or the workspace default when none
    # is set.
    datastore_name = (
        e.datastore_name or aml_workspace.get_default_datastore().name
    )
    run_config.environment.environment_variables["DATASTORE_NAME"] = datastore_name  # NOQA: E501

    datastore = Datastore(aml_workspace, name=datastore_name)
    data_file_path_param = PipelineParameter(
        name="data_file_path", default_value=e.dataset_name)

    # The version of the input/output dataset can't be determined at
    # pipeline publish time, only at run time.
    # Options to store output data:
    # Option 1: Use the blob API to write output data. Otherwise there is
    #   no way to dynamically change the output dataset based on a
    #   PipelineParameter. The following will not work; it generates a path
    #   like "PipelineParameter_Name:data_file_path_Default:gear_images":
    #     OutputFileDatasetConfig(
    #         destination=(datastore, data_file_path_param))
    #   This option means writing a file locally and uploading it to the
    #   datastore. Fewer datasets, more code.
    # Option 2: Use a dynamic path in OutputFileDatasetConfig and register
    #   a new dataset at completion. The output dataset can be mounted, so
    #   more datasets to maintain, less code.
    # Using Option 2 below.
    output_dataset = OutputFileDatasetConfig(
        name=e.processed_dataset_name,
        destination=(datastore, "/dataset/{output-name}/{run-id}")
    ).register_on_complete(name=e.processed_dataset_name)

    preprocess_step = PythonScriptStep(
        name="Preprocess Data with OS cmd",
        script_name='preprocess/preprocess_os_cmd_aml.py',
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--dataset_name", e.dataset_name,
            "--datastore_name", datastore_name,
            "--data_file_path", data_file_path_param,
            "--output_dataset", output_dataset,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Preprocess OS cmd created")

    steps = [preprocess_step]
    preprocess_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    # NOTE: a bare `preprocess_pipeline._set_experiment_name` expression
    # used to sit here. It never called the method, so it had no effect and
    # was removed.
    preprocess_pipeline.validate()

    published_pipeline = preprocess_pipeline.publish(
        name=e.preprocessing_pipeline_name,
        description="Data preprocessing OS cmd pipeline",
        version=e.build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
def main():
    """Publish and submit the Azure ML training/retraining pipeline.

    Settings are read from environment variables after ``load_dotenv()``.
    The function resolves the workspace and compute cluster, makes sure the
    blob datastore and the file dataset are registered, builds a pipeline
    with a training step followed by an evaluation step, validates and
    publishes it, and finally submits one run of the published pipeline.
    """
    load_dotenv()
    workspace_name = os.environ.get("WS_NAME")
    resource_group = os.environ.get("RG_NAME")
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")
    experiment_name = os.environ.get("EXPERIMENT_NAME")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(
        workspace_name,
        resource_group,
        subscription_id,
        tenant_id,
        app_id,
        app_secret)
    print('Now accessing:')
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(
        aml_workspace,
        compute_name,
        vm_size)
    if aml_compute is not None:
        print(aml_compute)

    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=['numpy', 'pandas', 'scikit-learn', 'keras'],
        pip_packages=['azureml-core==1.25.0',
                      'azureml-defaults==1.25.0',
                      'azureml-telemetry==1.25.0',
                      'azureml-train-restclients-hyperdrive==1.25.0',
                      'azureml-train-core==1.25.0',
                      'azureml-dataprep',
                      'tensorflow-gpu==2.0.0',
                      'transformers==2.0.0',
                      'absl-py',
                      'h5py<3.0.0'])
    )
    # NOTE(review): the following line appears commented out in the source;
    # confirm whether Docker should be enabled explicitly.
    # run_config.environment.docker.enabled = True

    datastore_name = 'mtcseattle'
    container_name = 'azure-service-classifier'
    account_name = 'mtcseattle'
    # SECURITY: a SAS token must not live in source control. Supply it via
    # the DATASTORE_SAS_TOKEN environment variable. The literal below is
    # kept only as a backward-compatible fallback.
    # TODO: remove the literal once the variable is configured everywhere.
    sas_token = os.environ.get(
        "DATASTORE_SAS_TOKEN",
        '?sv=2020-04-08&st=2021-05-26T04%3A39%3A46Z&se=2022-05-27T04%3A39%3A00Z&sr=c&sp=rl&sig=CTFMEu24bo2X06G%2B%2F2aKiiPZBzvlWHELe15rNFqULUk%3D')  # noqa: E501

    # Reuse the registered datastore; register it when the lookup fails.
    try:
        existing_datastore = Datastore.get(aml_workspace, datastore_name)
    except Exception:
        # `Exception` rather than a bare except, so that KeyboardInterrupt
        # and SystemExit still propagate.
        existing_datastore = Datastore.register_azure_blob_container(
            workspace=aml_workspace,
            datastore_name=datastore_name,
            container_name=container_name,
            account_name=account_name,
            sas_token=sas_token,
            overwrite=True)

    azure_dataset = Dataset.File.from_files(
        path=(existing_datastore, 'data'))
    azure_dataset = azure_dataset.register(
        workspace=aml_workspace,
        name='Azure Services Dataset',
        description='Dataset containing azure related posts on Stackoverflow',
        create_new_version=True)

    # The result is discarded; presumably this call only checks that the
    # dataset's files can be listed -- TODO confirm.
    azure_dataset.to_path()
    input_data = azure_dataset.as_named_input('azureservicedata').as_mount(
        '/tmp/data')

    model_name = PipelineParameter(
        name="model_name", default_value=model_name)
    max_seq_length = PipelineParameter(
        name="max_seq_length", default_value=128)
    learning_rate = PipelineParameter(
        name="learning_rate", default_value=3e-5)
    num_epochs = PipelineParameter(
        name="num_epochs", default_value=1)
    export_dir = PipelineParameter(
        name="export_dir", default_value="./outputs/model")
    batch_size = PipelineParameter(
        name="batch_size", default_value=32)
    steps_per_epoch = PipelineParameter(
        name="steps_per_epoch", default_value=1)

    # initialize the PythonScriptStep
    train_step = PythonScriptStep(
        name='Train Model',
        script_name=train_script_path,
        arguments=['--data_dir', input_data,
                   '--max_seq_length', max_seq_length,
                   '--batch_size', batch_size,
                   '--learning_rate', learning_rate,
                   '--steps_per_epoch', steps_per_epoch,
                   '--num_epochs', num_epochs,
                   '--export_dir', export_dir],
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        runconfig=run_config,
        allow_reuse=True)
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=evaluate_script_path,
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        arguments=[
            "--model_name", model_name,
            "--build_id", build_id,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    # Currently, the Evaluate step will automatically register
    # the model if it performs better. This step is based on a
    # previous version of the repo which utilized JSON files to
    # track evaluation results.

    # Only evaluate_step is listed; train_step is attached to it through
    # run_after.
    evaluate_step.run_after(train_step)
    steps = [evaluate_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline.",
        version=build_id
    )
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')

    # Submit one run of the published pipeline; the returned run object is
    # not used.
    published_pipeline.submit(
        workspace=aml_workspace,
        experiment_name=experiment_name)
class BatchScoringContext(BaseContext):
    '''
    Contains the context needed to perform the batch scoring tasks:
    storage containers, data upload, compute, data references and the
    published, scheduled pipeline.
    '''

    # Data and script information
    batch_data_directory = './paths/batch/scoring'
    batch_data_file = 'data.txt'
    batch_scoring_script = 'batch.py'
    # (sic) attribute name kept as-is; it is part of the class interface.
    bach_scoring_results_file = "Results.txt"

    # Data store information
    input_store_name = "inputdata"
    input_reference_name = "inputdataref"
    output_store_name = "outputdata"
    output_reference_name = "outputdataref"

    # Pipeline information
    pip_packages = []
    python_version = "3.6.7"

    def __init__(self, programArgs, userAuthorization):
        '''
        Initialise the base context and reset every artifact this class
        creates later on (compute, stores, references, pipeline, schedule).
        '''
        super().__init__(programArgs, userAuthorization)
        self.computeTarget = None
        self.inputDataStore = None
        self.inputDataReference = None
        self.outputDataStore = None
        self.outputDataReference = None
        self.pipelineStep = None
        self.pipeLine = None
        self.publishedPipeline = None
        # Set by createPipeline() when a new pipeline is scheduled.
        self.Schedule = None

    def generateStorageContainers(self):
        '''
        We are using the storage associated with the actual AMLS workspace.
        So, we need to create the container that has the data to be
        "scored" and a container where results will end up.
        '''
        storage_container_names = [
            self.programArguments.source_container,
            self.programArguments.result_container,
        ]

        storage_details = self.workspace.get_default_datastore()
        createStorageContainer(
            storage_details.account_name,
            storage_details.account_key,
            storage_container_names)

    def uploadDataFiles(self):
        '''
        Upload the data files into the source container, these are the
        files that will be processed by the AML compute cluster.
        '''
        storage_details = self.workspace.get_default_datastore()
        # batch_data_file may hold several comma-separated file names.
        data_files = BatchScoringContext.batch_data_file.split(",")
        uploadStorageBlobs(
            storage_details.account_name,
            storage_details.account_key,
            self.programArguments.source_container,
            BatchScoringContext.batch_data_directory,
            data_files)

    def generateCompute(self):
        '''
        Generate the AML compute cluster, or reuse the one created by an
        earlier call.

        Returns the compute target on both paths.
        Raises Exception when the cluster cannot be created.
        '''
        if self.computeTarget:
            return self.computeTarget

        self.computeTarget = createBatchComputeCluster(
            self.workspace,
            self.programArguments.batch_compute_name,
            self.programArguments.batch_vm_size,
            self.programArguments.batch_vm_max,
            self.programArguments.batch_vm_min)

        if not self.computeTarget:
            raise Exception("Cannot create compute target.")

        return self.computeTarget

    def createPipelineDataReferences(self):
        '''
        Datastores identify where data is coming from and going to in the
        process. Per this class's design notes, a store is registered only
        if one with the same name does not exist yet, otherwise the
        existing store is used (that logic lives in createDataReference).

        Stores are then wrapped in a DataReference object that will be
        used in the pipeline steps.

        For this example, we need two references. One for the input data
        file, one for the output results file. Both reside in the storage
        account from the AMLS workspace.
        '''
        storage_details = self.workspace.get_default_datastore()

        # Have to create one for input and one for output.
        #   source_container - the container name for the data file
        #   result_container - the container name for the results file(s)
        requested_datasets = {
            "in": (self.programArguments.source_container,
                   BatchScoringContext.input_store_name,
                   BatchScoringContext.input_reference_name),
            "out": (self.programArguments.result_container,
                    BatchScoringContext.output_store_name,
                    BatchScoringContext.output_reference_name),
        }

        for direction, names in requested_datasets.items():
            container_name, store_name, reference_name = names
            store, reference = createDataReference(
                self.workspace,
                storage_details.account_name,
                storage_details.account_key,
                container_name,
                store_name,
                reference_name)

            # Put the store and reference into the instance attributes.
            if direction == "in":
                self.inputDataStore = store
                self.inputDataReference = reference
            else:
                self.outputDataStore = store
                self.outputDataReference = reference

    def _createPipelineSteps(self):
        '''
        Create the single PythonScriptStep of the pipeline and store it in
        self.pipelineStep.

        Raises Exception when the step could not be created.
        '''
        # You first need the conda dependencies that will be baked into
        # the image to be pushed down to the batch compute cluster for a
        # working environment. In this example we don't need anything
        # other than Python.
        conda_dependencies = CondaDependencies.create(
            pip_packages=BatchScoringContext.pip_packages,
            python_version=BatchScoringContext.python_version)

        run_config = RunConfiguration(conda_dependencies=conda_dependencies)
        run_config.environment.docker.enabled = True

        # Next we need to let the pipeline know where the output is going.
        # This is expected to be a PipelineData object:
        #   name      = the directory on the cluster machine in which
        #               output is expected.
        #   datastore = identifies the end storage, here the output
        #               datastore created by createPipelineDataReferences.
        prediction_ref = PipelineData(
            name="preds",
            datastore=self.outputDataStore,
            is_directory=True)

        # Next we create a step for the pipeline.
        #   Script information: the local directory holding the python
        #       script and the file name of the script to upload.
        #   Script arguments: input file, input directory, output file,
        #       output directory.
        #   inputs: the data reference that holds our data file.
        #   outputs: the PipelineData the script writes to.
        #   compute_target: the compute target that processes requests.
        #   runconfig: the conda / python dependencies the resulting
        #       container requires to execute successfully.
        self.pipelineStep = PythonScriptStep(
            name="basic_pipeline_step",
            source_directory=BatchScoringContext.batch_data_directory,
            script_name=BatchScoringContext.batch_scoring_script,
            arguments=[
                BatchScoringContext.batch_data_file,
                self.inputDataReference,
                BatchScoringContext.bach_scoring_results_file,
                prediction_ref
            ],
            inputs=[self.inputDataReference],
            outputs=[prediction_ref],
            compute_target=self.computeTarget,
            runconfig=run_config,
            allow_reuse=False,
        )

        if self.pipelineStep is None:
            raise Exception("Unable to create python step.")

    def createPipeline(self):
        '''
        A pipeline is a series of steps but also requires DataReference
        objects in those steps so that it knows where to get data from and
        where to deposit outputs.

        In this step, if a PublishedPipeline exists by name, a new
        pipeline is not created. Otherwise the pipeline is built,
        validated, published and scheduled.
        '''
        self.publishedPipeline = getExistingPipeline(
            self.workspace, self.programArguments.pipeline_name)

        if self.publishedPipeline:
            print("Found existing pipeline - ",
                  self.programArguments.pipeline_name)
        else:
            print("Creating pipeline - ",
                  self.programArguments.pipeline_name)

            print("Creating pipeline steps .....")
            self._createPipelineSteps()

            # Pipeline documents `steps` as a list, so wrap the single
            # step.
            self.pipeLine = Pipeline(
                workspace=self.workspace, steps=[self.pipelineStep])
            self.pipeLine.validate()

            print("Publishing pipeline .....")
            self.publishedPipeline = self.pipeLine.publish(
                name=self.programArguments.pipeline_name,
                description="Dummy Pipeline")

            # Now we schedule it. We generate the schedule recurrence
            # (when this pipeline should run) and then create the schedule
            # by identifying the published pipeline that is being
            # requested.
            print("Scheduling pipeline .....")
            experiment_name = "exp_" + self.programArguments.pipeline_name
            recurrence = ScheduleRecurrence(
                frequency=self.programArguments.schedule_frequency,
                interval=self.programArguments.schedule_interval)

            self.Schedule = Schedule.create(
                workspace=self.workspace,
                name="{}_sched".format(self.programArguments.pipeline_name),
                pipeline_id=self.publishedPipeline.id,
                experiment_name=experiment_name,
                recurrence=recurrence,
                description="Pipeline schedule for {}".format(
                    self.programArguments.pipeline_name),
            )

        # Print out what we know of the pipeline, in particular its
        # status and the endpoint.
        print("Pipeline : ", self.publishedPipeline.name)
        print("Pipeline Endpoint: ", self.publishedPipeline.endpoint)
        print("Pipeline Status: ", self.publishedPipeline.status)
def main():
    """Publish the Azure ML pipeline that trains and then evaluates a model.

    Configuration comes from an ``Env`` instance. The function resolves the
    workspace and compute cluster, builds a train step and an evaluate step
    that share one run configuration and the same script arguments, chains
    them, then validates and publishes the pipeline.
    """
    e = Env()

    # Get Azure machine learning workspace
    workspace = get_workspace(
        e.workspace_name,
        e.resource_group,
        e.subscription_id,
        e.tenant_id,
        e.app_id,
        e.app_secret,
    )
    print("get_workspace:")
    print(workspace)

    # Get Azure machine learning cluster
    cluster = get_compute(workspace, e.compute_name, e.vm_size)
    if cluster is not None:
        print("aml_compute:")
        print(cluster)

    # Python environment shared by both steps.
    dependencies = CondaDependencies.create(
        conda_packages=[
            'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras'
        ],
        pip_packages=[
            'azure', 'azureml-core', 'azure-storage', 'azure-storage-blob'
        ],
    )
    shared_run_config = RunConfiguration(conda_dependencies=dependencies)
    shared_run_config.environment.docker.enabled = True

    model_param = PipelineParameter(
        name="model_name", default_value=e.model_name)
    release_param = PipelineParameter(
        name="release_id", default_value="0")

    def make_step(step_name, script):
        # Each step gets its own arguments list, built from the same two
        # pipeline parameters.
        return PythonScriptStep(
            name=step_name,
            script_name=script,
            compute_target=cluster,
            source_directory=e.sources_directory_train,
            arguments=[
                "--release_id", release_param,
                "--model_name", model_param,
            ],
            runconfig=shared_run_config,
            allow_reuse=False,
        )

    trainer = make_step("Train Model", e.train_script_path)
    print("Step Train created")

    evaluator = make_step("Evaluate Model ", e.evaluate_script_path)
    print("Step Evaluate created")

    # Only the evaluate step is listed; the train step is attached to it
    # through run_after.
    evaluator.run_after(trainer)
    pipeline = Pipeline(workspace=workspace, steps=[evaluator])
    pipeline.validate()

    published = pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id,
    )
    print(f'Published pipeline: {published.name}')
    print(f'for build {published.version}')
def build_prednet_pipeline(dataset, ws):
    """Build, publish, schedule and submit the PredNet pipeline for a dataset.

    The pipeline chains six steps: data preparation, HyperDrive training,
    PredNet registration, batch scoring, classifier training and classifier
    registration. After publishing, a datastore-triggered schedule is
    created and the published pipeline is submitted once.

    Args:
        dataset: Dataset name; used as a script argument, as part of the
            pipeline name and as a path component on the datastore.
        ws: Azure ML workspace the pipeline is built in.

    Notes:
        Compute target names are read from a module-level ``args`` object
        that is defined outside this function.
    """
    print("building pipeline for dataset %s in workspace %s" %
          (dataset, ws.name))

    base_dir = "."

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = "./scripts"
    # Staging must be repeatable: tolerate an existing folder and replace a
    # previously staged copy of the models package instead of failing.
    os.makedirs(script_folder, exist_ok=True)
    staged_models = os.path.join(base_dir, script_folder, "models")
    if os.path.isdir(staged_models):
        shutil.rmtree(staged_models)
    shutil.copytree(os.path.join(base_dir, "models"), staged_models)
    for script_file in (
        "train.py",
        "data_preparation.py",
        "register_prednet.py",
        "batch_scoring.py",
        "train_clf.py",
        "register_clf.py",
    ):
        shutil.copy(os.path.join(base_dir, script_file), script_folder)

    cpu_compute_name = args.cpu_compute_name
    cpu_compute_target = AmlCompute(ws, cpu_compute_name)
    print("found existing compute target: %s" % cpu_compute_name)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    gpu_compute_name = args.gpu_compute_name
    gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
    print(gpu_compute_target.get_status().serialize())

    env = Environment.get(ws, "prednet")

    # Runconfigs
    runconfig = RunConfiguration()
    runconfig.environment = env

    # DataReference to where raw data is stored.
    raw_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="raw_data",
        path_on_datastore=os.path.join("prednet", "data", "raw_data"),
    )
    print("DataReference object created")

    # Intermediate data passed between steps.
    preprocessed_data = PipelineData(
        "preprocessed_data", datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    hd_child_cwd = PipelineData(
        "prednet_model_path", datastore=def_blob_store)
    scored_data = PipelineData("scored_data", datastore=def_blob_store)
    model_path = PipelineData("model_path", datastore=def_blob_store)
    print("PipelineData object created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name="prepare_data",
        script_name="data_preparation.py",
        arguments=[
            "--raw_data", raw_data,
            "--preprocessed_data", preprocessed_data,
            "--dataset", dataset,
        ],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    print("data_prep step created")

    est = Estimator(
        source_directory=script_folder,
        compute_target=gpu_compute_target,
        entry_script="train.py",
        node_count=1,
        environment_definition=env,
    )

    ps = BayesianParameterSampling({
        "--batch_size": choice(1, 2, 4, 10),
        "--filter_sizes": choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        "--stack_sizes": choice(
            "48, 96, 192", "36, 72, 144", "12, 24, 48"),
        "--learning_rate": uniform(1e-6, 1e-3),
        "--lr_decay": uniform(1e-9, 1e-2),
        "--freeze_layers": choice(
            "0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
    })

    hdc = HyperDriveConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        primary_metric_name="val_loss",
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=3,
        max_concurrent_runs=3,
        max_duration_minutes=60 * 6,
    )

    train_prednet = HyperDriveStep(
        "train_w_hyperdrive",
        hdc,
        estimator_entry_script_arguments=[
            "--preprocessed_data", preprocessed_data,
            "--remote_execution",
            "--dataset", dataset,
        ],
        inputs=[preprocessed_data],
        outputs=[hd_child_cwd],
        metrics_output=data_metrics,
        allow_reuse=True,
    )
    train_prednet.run_after(data_prep)

    # NOTE(review): unlike the other script steps, this one is created
    # without a runconfig - confirm that is intended.
    register_prednet = PythonScriptStep(
        name="register_prednet",
        script_name="register_prednet.py",
        arguments=[
            "--data_metrics", data_metrics,
        ],
        compute_target=cpu_compute_target,
        inputs=[data_metrics, hd_child_cwd],
        source_directory=script_folder,
        allow_reuse=True,
    )
    register_prednet.run_after(train_prednet)

    batch_scoring = PythonScriptStep(
        name="batch_scoring",
        script_name="batch_scoring.py",
        arguments=[
            "--preprocessed_data", preprocessed_data,
            "--scored_data", scored_data,
            "--dataset", dataset,
        ],
        compute_target=gpu_compute_target,
        inputs=[preprocessed_data],
        outputs=[scored_data],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    batch_scoring.run_after(register_prednet)

    train_clf = PythonScriptStep(
        name="train_clf",
        script_name="train_clf.py",
        arguments=[
            "--preprocessed_data", preprocessed_data,
            "--scored_data", scored_data,
            "--model_path", model_path
        ],
        compute_target=cpu_compute_target,
        inputs=[preprocessed_data, scored_data],
        outputs=[model_path],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    train_clf.run_after(batch_scoring)

    register_clf = PythonScriptStep(
        name="register_clf",
        script_name="register_clf.py",
        arguments=["--model_path", model_path],
        inputs=[model_path],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        allow_reuse=True,
        runconfig=runconfig,
    )
    register_clf.run_after(train_clf)

    pipeline = Pipeline(
        workspace=ws,
        steps=[
            data_prep,
            train_prednet,
            register_prednet,
            batch_scoring,
            train_clf,
            register_clf,
        ],
    )
    pipeline.validate()

    pipeline_name = "prednet_" + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    # Datastore-triggered schedule watching the dataset's "Train" folder;
    # the returned Schedule object is not needed afterwards.
    _ = Schedule.create(
        workspace=ws,
        name=pipeline_name + "_sch",
        pipeline_id=published_pipeline.id,
        experiment_name=pipeline_name,
        datastore=def_blob_store,
        wait_for_provisioning=True,
        description="Datastore scheduler for Pipeline" + pipeline_name,
        path_on_datastore=os.path.join(
            "prednet/data/raw_data", dataset, "Train"),
        polling_interval=60 * 24,
    )

    published_pipeline.submit(ws, pipeline_name)
def main():
    """Publish the GPU train -> evaluate -> report pipeline.

    Configuration is read from environment variables after
    ``load_dotenv()`` has run. The three script steps share one
    GPU/Docker run configuration and receive the same script arguments.
    """
    load_dotenv()
    read = os.environ.get

    workspace_name = read("WORKSPACE_NAME")
    resource_group = read("RESOURCE_GROUP_NAME")
    subscription_id = read("SUBSCRIPTION_ID")
    tenant_id = read("TENANT_ID")
    app_id = read("SP_APP_ID")
    app_secret = read("SP_APP_SECRET")
    train_source_dir = read("SOURCES_DIR_TRAIN")
    train_script = read("TRAIN_SCRIPT_PATH")
    evaluate_script = read("EVALUATE_SCRIPT_PATH")
    report_source_dir = read("GENERATE_REPORT_PATH")
    report_script = read("GENERATE_REPORT_NAME")
    vm_size = read("AML_COMPUTE_CLUSTER_GPU_SKU")
    compute_name = read("AML_COMPUTE_CLUSTER_NAME")
    default_model_name = read("MODEL_NAME")
    ckpt_path = read("MODEL_CHECKPOINT_PATH")
    build_id = read("BUILD_BUILDID")
    pipeline_name = read("TRAINING_PIPELINE_NAME")
    epis_datastore = read("EPIS_DATASTORE")
    epis_container = read("EPIS_CONTAINER")

    workspace = get_workspace(
        workspace_name,
        resource_group,
        subscription_id,
        tenant_id,
        app_id,
        app_secret,
    )
    print(workspace)

    compute_target = get_compute(workspace, compute_name, vm_size)
    if compute_target is not None:
        print(compute_target)

    dependencies = CondaDependencies.create(
        conda_packages=['numpy==1.18.1', 'pandas', 'tensorflow-gpu==2.0.0'],
        pip_packages=[
            'azure',
            'azureml-core==1.0.60',
            'azureml-tensorboard',
            'azure-storage==0.36.0',
            'tqdm==4.41.1',
            'opencv-python==4.1.2.30',
            'easydict==1.9',
            'matplotlib==3.1.3',
        ],
    )
    gpu_run_config = RunConfiguration(conda_dependencies=dependencies)
    docker_settings = gpu_run_config.environment.docker
    docker_settings.enabled = True
    docker_settings.gpu_support = True
    docker_settings.base_image = DEFAULT_GPU_IMAGE

    model_name_param = PipelineParameter(
        name="model_name", default_value=default_model_name)
    release_id_param = PipelineParameter(
        name="release_id", default_value=build_id)

    def make_step(step_name, script_path, source_dir):
        # All three steps take the same arguments; build a new list each
        # time so no list object is shared between steps.
        return PythonScriptStep(
            name=step_name,
            script_name=script_path,
            compute_target=compute_target,
            source_directory=source_dir,
            arguments=[
                "--release_id", release_id_param,
                "--model_name", model_name_param,
                "--ckpt_path", ckpt_path,
                "--datastore", epis_datastore,
                "--storage_container", epis_container,
            ],
            runconfig=gpu_run_config,
            allow_reuse=False,
        )

    training = make_step("Train Model", train_script, train_source_dir)
    print("Step Train created")

    evaluation = make_step(
        "Evaluate Model", evaluate_script, train_source_dir)
    print("Step Evaluate created")

    reporting = make_step(
        "Generate Report Model", report_script, report_source_dir)
    print("Step generate report created")

    evaluation.run_after(training)
    reporting.run_after(evaluation)

    pipeline = Pipeline(
        workspace=workspace, steps=[training, evaluation, reporting])
    pipeline.validate()

    published = pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline",
        version=build_id,
    )
    print(f'Published pipeline: {published.name}')
    print(f'for build {published.version}')
def main():
    """Publish the prepare -> train -> [evaluate] -> register pipeline.

    Settings come from ``Env``. The evaluation step is always constructed
    but only wired into the pipeline when ``e.run_evaluation`` equals
    "true" (case-insensitive); otherwise registration runs directly after
    training.

    Raises:
        ValueError: if ``e.dataset_name`` is not among the workspace's
            registered datasets.
    """
    e = Env()
    # NOTE(review): this prints every attribute of Env - confirm it holds
    # no secrets before keeping this in build logs.
    print(e.__dict__)

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(aml_workspace,
                                  e.aml_env_name,
                                  create_new=e.rebuild_env)
    run_config = RunConfiguration()
    run_config.environment = environment

    # Fetch the default datastore once; it backs every PipelineData below
    # and supplies the fallback datastore name.
    default_datastore = aml_workspace.get_default_datastore()
    datastore_name = e.datastore_name or default_datastore.name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)
    dataset_version_param = PipelineParameter(
        name="dataset_version", default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(name="data_file_path",
                                             default_value="none")
    caller_run_id_param = PipelineParameter(name="caller_run_id",
                                            default_value="none")

    # Get dataset name
    dataset_name = e.dataset_name

    # Check to see if dataset exists
    if dataset_name not in aml_workspace.datasets:
        raise ValueError(
            f"can't find dataset {dataset_name} in datastore {datastore_name}")

    # Create PipelineData to pass data between steps
    model_data = PipelineData("model_data", datastore=default_datastore)

    # Train/test splits are produced by the prepare step and registered as
    # new versions of the "train" / "test" datasets.
    train_ds = (
        PipelineData("train_ds", datastore=default_datastore)
        .as_dataset()
        .parse_delimited_files()
        .register(name="train", create_new_version=True)
    )
    test_ds = (
        PipelineData("test_ds", datastore=default_datastore)
        .as_dataset()
        .parse_delimited_files()
        .register(name="test", create_new_version=True)
    )

    prepare_step = PythonScriptStep(
        name="Prepare Data",
        script_name=e.prepare_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[train_ds, test_ds],
        arguments=[
            "--dataset_version", dataset_version_param,
            "--data_file_path", data_file_path_param,
            "--dataset_name", dataset_name,
            "--caller_run_id", caller_run_id_param,
            "--train_ds", train_ds,
            "--test_ds", test_ds
        ],
        runconfig=run_config,
        allow_reuse=True,
    )
    print("Step Prepare created")

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[
            train_ds.as_named_input("training_data"),
            test_ds.as_named_input("testing_data")
        ],
        outputs=[model_data],
        arguments=[
            "--model_name", model_name_param,
            "--model_data", model_data
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name", model_name_param,
            "--allow_run_cancel", e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[model_data],
        arguments=[
            "--model_name", model_name_param,
            "--step_input", model_data
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")

    # Check run_evaluation flag to include or exclude evaluation step.
    if e.run_evaluation.lower() == "true":
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [prepare_step, train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [prepare_step, train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
def main():
    """Publish the train -> evaluate -> register pipeline.

    Registers a new version of the tabular training dataset, builds three
    script steps sharing one run configuration, publishes the pipeline
    and, when an output file name was given on the command line, writes
    the published pipeline id to that file.
    """
    env = EnvironmentVariables()
    args = add_arguments()
    workspace = get_workspace()

    compute = get_or_create_compute(
        workspace,
        env.cpu_cluster_name,
        env.compute_vm_size,
        env.max_nodes,
    )

    # Environment definition is loaded from the training sources directory
    # and registered in the workspace.
    aml_environment = Environment.load_from_directory(
        env.sources_directory_train)
    aml_environment.register(workspace)

    run_configuration = RunConfiguration()
    run_configuration.environment = aml_environment

    model_name_param = PipelineParameter(
        name="model_name", default_value=env.model_name)
    build_id_param = PipelineParameter(
        name="build_id", default_value=env.build_id)
    tune_param = PipelineParameter(
        name="should_tune_hyperparameters",
        default_value=env.should_tune_hyperparameters)
    parallelism_param = PipelineParameter(
        name="parallelism_level", default_value=env.parallelism_level)
    force_register_param = PipelineParameter(
        name="force_register", default_value=env.force_register)

    datastore = get_datastore()
    dataset_name = env.dataset_name
    dataset_path = env.dataset_path

    print(
        f"Creating new dataset version for {dataset_name} in datastore {datastore} from file {dataset_path}"
    )
    unregistered = Dataset.Tabular.from_delimited_files(
        path=[(datastore, dataset_path)])
    dataset = unregistered.register(
        workspace=workspace,
        name=dataset_name,
        description=dataset_name,
        tags={'format': 'CSV'},
        create_new_version=True,
    )

    train_output = PipelineData(
        'train_output', output_name='train_output', datastore=datastore)

    train_step = PythonScriptStep(
        name="Train model",
        compute_target=compute,
        script_name=env.train_script_name,
        runconfig=run_configuration,
        inputs=[dataset.as_named_input('training')],
        outputs=[train_output],
        arguments=[
            "--build_id", build_id_param,
            "--model_name", model_name_param,
            "--parallelism_level", parallelism_param,
            "--should_tune_hyperparameters", tune_param,
        ],
        allow_reuse=False,
    )

    evaluate_step = PythonScriptStep(
        name="Evaluate model",
        compute_target=compute,
        script_name=env.evaluate_script_name,
        runconfig=run_configuration,
        inputs=[train_output],
        arguments=[
            "--build_id", build_id_param,
            "--model_name", model_name_param,
            "--train_output", train_output,
            "--force_register", force_register_param,
        ],
        allow_reuse=False,
    )

    register_step = PythonScriptStep(
        name="Register model",
        compute_target=compute,
        script_name=env.register_script_name,
        runconfig=run_configuration,
        inputs=[train_output],
        arguments=[
            "--build_id", build_id_param,
            "--model_name", model_name_param,
            "--train_output", train_output,
        ],
        allow_reuse=False,
    )

    evaluate_step.run_after(train_step)
    register_step.run_after(evaluate_step)

    train_pipeline = Pipeline(
        workspace=workspace,
        steps=[train_step, evaluate_step, register_step],
    )
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=env.pipeline_name,
        description="Train/Eval/Register if better pipeline",
        version=env.build_id,
    )

    # Optionally hand the pipeline id to the caller through a file.
    id_file_name = args.output_file_name
    if id_file_name:
        with open(id_file_name, "w") as id_file:
            id_file.write(published_pipeline.id)

    print(
        f"Published pipeline {published_pipeline.name} for build {published_pipeline.version}"
    )
def main():
    """Publish the many-models training pipeline (single ParallelRunStep).

    Ensures the training file dataset is registered in the workspace,
    builds a conda-based environment with Application Insights settings,
    creates the parallel-run configuration via
    ``build_parallel_run_config`` and publishes a one-step pipeline under
    ``e.pipeline_name`` with ``e.build_id`` as the version.
    """
    # Imports kept function-local, as in the original code, so the rest of
    # the module does not depend on them.
    from azureml.pipeline.core import PipelineData
    from azureml.pipeline.steps import ParallelRunStep
    from ff.util.helper import build_parallel_run_config

    e = Env()

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print(f"get_workspace: {aml_workspace}")

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print(f"aml_compute: {aml_compute}")

    # Prepare the dataset input
    data_store = aml_workspace.get_default_datastore()
    print("data_store: %s" % data_store.name)

    train_ds_name = e.dataset_name
    train_data_path = e.datafile_path
    sources_directory_train = e.sources_directory_train
    pipeline_name = e.pipeline_name
    build_id = e.build_id

    # Register the train dataset only when it is not registered yet;
    # otherwise reuse the existing registration.
    if train_ds_name not in aml_workspace.datasets:
        train_ds = Dataset.File.from_files(
            path=[(data_store, train_data_path)], validate=False)
        train_ds = train_ds.register(workspace=aml_workspace,
                                     name=train_ds_name,
                                     description='train data',
                                     tags={'format': 'CSV'},
                                     create_new_version=True)
    else:
        train_ds = Dataset.get_by_name(aml_workspace, train_ds_name)
    train_input = train_ds.as_named_input('train_input')

    # Conda environment
    environment = Environment.from_conda_specification(
        "myenv",
        os.path.join(sources_directory_train, "conda_dependencies.yml"))

    # Logging into Azure Application Insights
    # NOTE(review): DISABLE_ENV_MISMATCH is a bool while the other values
    # are strings - confirm the SDK serializes it as intended.
    environment.environment_variables = {
        "APPLICATIONINSIGHTS_CONNECTION_STRING":
            e.applicationinsights_connection_string,
        'AZUREML_FLUSH_INGEST_WAIT': '',
        'DISABLE_ENV_MISMATCH': True,
    }

    # PLEASE MODIFY the following three settings based on your compute and
    # experiment timeout.
    process_count_per_node = 6
    node_count = 3
    # this timeout (in seconds) is in line with the AutoML experiment
    # timeout or (no of iterations * iteration timeout)
    run_invocation_timeout = 3700

    parallel_run_config = build_parallel_run_config(sources_directory_train,
                                                    environment,
                                                    aml_compute,
                                                    node_count,
                                                    process_count_per_node,
                                                    run_invocation_timeout)

    output_dir = PipelineData(name="training_output", datastore=data_store)

    parallel_run_step = ParallelRunStep(
        name="many-models-training",
        parallel_run_config=parallel_run_config,
        allow_reuse=False,
        inputs=[train_input],
        output=output_dir,
    )

    # Pipeline documents ``steps`` as a list, so wrap the single step.
    pipeline = Pipeline(workspace=aml_workspace, steps=[parallel_run_step])
    pipeline.validate()
    published_pipeline = pipeline.publish(name=pipeline_name,
                                          description="FF AutomML pipeline",
                                          version=build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
def create_experiment_config(workspace):
    """Build and validate the data-prep + model-train pipeline.

    Each step is configured from a ``step.json`` file in its own folder
    (``steps/data_prep`` and ``steps/model_train``). The training step is
    created as a HyperDriveStep when the hyperparameter settings allow it;
    if building it fails, a plain EstimatorStep is used instead.

    Args:
        workspace: Azure ML workspace used to look up datasets, datastores
            and compute targets, and to own the pipeline.

    Returns:
        The validated ``Pipeline``. Only the training step is listed; the
        data-prep step is attached through the dataset it produces.
    """
    ########################################
    ### Creating data prep Pipeline Step ###
    ########################################

    # Load settings
    print("Loading settings")
    data_prep_step_path = os.path.join("steps", "data_prep")
    with open(os.path.join(data_prep_step_path, "step.json")) as f:
        data_prep_settings = json.load(f)

    # Setup datasets of first step
    print("Setting up datasets")
    data_prep_input_name = data_prep_settings.get("dataset_input_name")
    data_prep_input = Dataset.get_by_name(
        workspace=workspace,
        name=data_prep_input_name,
    ).as_named_input(data_prep_input_name).as_mount()
    data_prep_output = PipelineData(
        name=data_prep_settings.get("dataset_output_name"),
        datastore=Datastore(
            workspace=workspace,
            name=data_prep_settings.get(
                "datastore_output_name", "workspaceblobstore")),
        output_mode="mount",
    ).as_dataset()
    # To register the intermediate dataset, call
    # data_prep_output.register(name=..., create_new_version=True) here.

    # Create conda dependencies
    print("Creating conda dependencies")
    data_prep_dependencies = CondaDependencies.create(
        pip_packages=data_prep_settings.get("pip_packages", []),
        conda_packages=data_prep_settings.get("conda_packages", []),
        python_version=data_prep_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    data_prep_run_config = RunConfiguration(
        conda_dependencies=data_prep_dependencies,
        framework=data_prep_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    data_prep_compute_target = ComputeTarget(
        workspace=workspace,
        name=data_prep_settings.get("compute_target_name"))

    # Create python step
    print("Creating Step")
    data_prep = PythonScriptStep(
        name=data_prep_settings.get("step_name"),
        script_name=data_prep_settings.get("script_name"),
        arguments=data_prep_settings.get("arguments", []),
        compute_target=data_prep_compute_target,
        runconfig=data_prep_run_config,
        inputs=[data_prep_input],
        outputs=[data_prep_output],
        params=data_prep_settings.get("parameters", []),
        source_directory=data_prep_step_path,
        allow_reuse=data_prep_settings.get("allow_reuse", True),
        version=data_prep_settings.get("version"),
    )

    ###############################################
    ### Creating data model train Pipeline Step ###
    ###############################################

    # Load settings
    print("Loading settings")
    model_train_step_path = os.path.join("steps", "model_train")
    with open(os.path.join(model_train_step_path, "step.json")) as f:
        model_train_settings = json.load(f)
    hyperparameter_sampling_settings = model_train_settings.get(
        "hyperparameter_sampling", {})

    # Setup datasets of second step: it consumes the data-prep output.
    print("Setting up datasets")
    model_train_input = data_prep_output.as_named_input(
        name=model_train_settings.get("dataset_input_name"))
    model_train_output = PipelineData(
        name=model_train_settings.get("dataset_output_name"),
        datastore=Datastore(
            workspace=workspace,
            name=model_train_settings.get("datastore_output_name")),
        output_mode="mount",
    ).as_dataset()
    # To register the intermediate dataset, call
    # model_train_output.register(name=..., create_new_version=True) here.

    # Create conda dependencies
    print("Creating conda dependencies")
    model_train_dependencies = CondaDependencies.create(
        pip_packages=model_train_settings.get("pip_packages", []),
        conda_packages=model_train_settings.get("conda_packages", []),
        python_version=model_train_settings.get("python_version", "3.6.2"))

    # Create run configuration
    # NOTE(review): this run configuration is built but not passed to any
    # step below - confirm whether it is still needed.
    print("Creating RunConfiguration")
    model_train_run_config = RunConfiguration(
        conda_dependencies=model_train_dependencies,
        framework=model_train_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    model_train_compute_target = ComputeTarget(
        workspace=workspace,
        name=model_train_settings.get("compute_target_name"))

    # Create distributed training backend
    print("Creating distributed training backend")
    distributed_training_backend = get_distributed_backend(
        backend_name=model_train_settings.get("distributed_backend"))

    # Create Estimator for Training
    print("Creating Estimator for training")
    model_train_estimator = Estimator(
        source_directory=model_train_step_path,
        entry_script=model_train_settings.get("script_name"),
        environment_variables=model_train_settings.get("parameters"),
        compute_target=model_train_compute_target,
        node_count=model_train_settings.get("node_count"),
        distributed_training=distributed_training_backend,
        conda_packages=model_train_settings.get("conda_packages"),
        pip_packages=model_train_settings.get("pip_packages"),
    )

    # NOTE(review): both step variants below default ``version`` to True,
    # whereas the data-prep step defaults it to None - confirm.
    try:
        # Create parameter sampling
        print("Creating Parameter Sampling")
        parameters = hyperparameter_sampling_settings.get("parameters", {})
        parameter_dict = {
            f"--{parameter_name}": get_parameter_distribution(
                distribution=parameter_details.get("distribution"),
                **parameter_details.get("settings", {}))
            for parameter_name, parameter_details in parameters.items()
        }
        model_train_ps = get_parameter_sampling(
            sampling_method=hyperparameter_sampling_settings.get("method"),
            parameter_dict=parameter_dict)

        # Get Policy definition. Keys that are passed explicitly below
        # (including "method", forwarded as policy_method) are kept out of
        # the extra keyword arguments so they are not supplied twice.
        policy_settings = hyperparameter_sampling_settings.get("policy", {})
        explicit_policy_keys = (
            "method",
            "policy_method",
            "evaluation_interval",
            "delay_evaluation",
        )
        kwargs = {
            key: value
            for key, value in policy_settings.items()
            if key not in explicit_policy_keys
        }

        # Create termination policy
        print("Creating early termination policy")
        model_train_policy = get_policy(
            policy_method=policy_settings.get("method", ""),
            evaluation_interval=policy_settings.get("evaluation_interval"),
            delay_evaluation=policy_settings.get("delay_evaluation"),
            **kwargs)

        # Create HyperDriveConfig
        print("Creating HyperDriveConfig")
        # A missing "primary_metric_goal" raises here (membership test on
        # None) and therefore triggers the EstimatorStep fallback.
        minimize = "min" in hyperparameter_sampling_settings.get(
            "primary_metric_goal")
        model_train_hyperdrive_config = HyperDriveConfig(
            estimator=model_train_estimator,
            hyperparameter_sampling=model_train_ps,
            policy=model_train_policy,
            primary_metric_name=hyperparameter_sampling_settings.get(
                "primary_metric"),
            primary_metric_goal=(PrimaryMetricGoal.MINIMIZE if minimize
                                 else PrimaryMetricGoal.MAXIMIZE),
            max_total_runs=hyperparameter_sampling_settings.get(
                "max_total_runs", 1),
            max_concurrent_runs=hyperparameter_sampling_settings.get(
                "max_concurrent_runs", 1),
            max_duration_minutes=hyperparameter_sampling_settings.get(
                "max_duration_minutes"))

        # Create HyperDriveStep
        print("Creating HyperDriveStep")
        model_train = HyperDriveStep(
            name=model_train_settings.get("step_name"),
            hyperdrive_config=model_train_hyperdrive_config,
            estimator_entry_script_arguments=model_train_settings.get(
                "arguments"),
            inputs=[model_train_input],
            outputs=[model_train_output],
            allow_reuse=model_train_settings.get("allow_reuse", True),
            version=model_train_settings.get("version", True))
    except Exception as exc:
        # Any failure while assembling the HyperDrive configuration falls
        # back to a plain estimator step; report why so misconfiguration
        # is not silently hidden. Interrupts are no longer swallowed.
        print("Not all required parameters specified for HyperDrive step")
        print(f"HyperDrive step setup failed with: {exc!r}")

        # Create EstimatorStep
        print("Creating EstimatorStep")
        model_train = EstimatorStep(
            name=model_train_settings.get("step_name"),
            estimator=model_train_estimator,
            estimator_entry_script_arguments=model_train_settings.get(
                "arguments"),
            inputs=[model_train_input],
            outputs=[model_train_output],
            compute_target=model_train_compute_target,
            allow_reuse=model_train_settings.get("allow_reuse", True),
            version=model_train_settings.get("version", True))

    #########################
    ### Creating Pipeline ###
    #########################

    # Create Pipeline
    print("Creating Pipeline")
    pipeline = Pipeline(
        workspace=workspace,
        steps=[model_train],
        description="Training Pipeline",
    )

    # Validate pipeline
    print("Validating pipeline")
    pipeline.validate()

    return pipeline
def main():
    """Publish the train -> evaluate -> register pipeline.

    Settings come from ``Env``. All three steps share one run
    configuration whose environment carries the ``DATASTORE_NAME``
    variable; the training step hands its output to the register step
    through a ``PipelineData`` object.
    """
    e = Env()

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print(f"get_workspace:{aml_workspace}")

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print(f"aml_compute:{aml_compute}")

    # Create a reusable Azure ML environment
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        conda_dependencies_file=e.aml_env_train_conda_dep_file,
        create_new=e.rebuild_env,
    )
    run_config = RunConfiguration()
    run_config.environment = environment

    # Fetch the default datastore once; it backs the PipelineData below and
    # supplies the fallback datastore name.
    default_datastore = aml_workspace.get_default_datastore()
    datastore_name = e.datastore_name or default_datastore.name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name

    # datastore and dataset names are fixed for this pipeline, however
    # data_file_path can be specified for registering new versions of dataset
    # Note that AML pipeline parameters don't take empty string as default,
    # "" won't work
    model_name_param = PipelineParameter(
        name="model_name", default_value=e.model_name)
    data_file_path_param = PipelineParameter(
        name="data_file_path", default_value="nopath")
    ml_params = PipelineParameter(name="ml_params", default_value="default")

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData("pipeline_data", datastore=default_datastore)

    train_step = PythonScriptStep(
        name="Train Model",
        script_name="train/train_aml.py",
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[pipeline_data],
        arguments=[
            "--model_name", model_name_param,
            "--step_output", pipeline_data,
            "--data_file_path", data_file_path_param,
            "--dataset_name", e.processed_dataset_name,
            "--datastore_name", datastore_name,
            "--ml_params", ml_params,
        ],
        runconfig=run_config,
        allow_reuse=True,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name="evaluate/evaluate_model.py",
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name", model_name_param,
            "--ml_params", ml_params,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name="register/register_model.py",
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--model_name", model_name_param,
            "--step_input", pipeline_data,
            "--ml_params", ml_params,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")

    evaluate_step.run_after(train_step)
    register_step.run_after(evaluate_step)
    steps = [train_step, evaluate_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.training_pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
def main():
    """Publish the iris train/register pipeline and submit one run of it.

    The workspace is resolved with service-principal credentials taken from
    environment variables, its configuration is written to ``config.json``,
    and a two-step pipeline (train, then register) is validated, published
    as ``iris-pipeline`` and submitted to ``iris-pipeline-experiment``.
    """
    e = Env()
    print(e.workspace_name)

    svc_pr = ServicePrincipalAuthentication(
        tenant_id=os.environ.get("TENANT_ID"),
        service_principal_id=os.environ.get("AZURE_SP_ID"),
        service_principal_password=os.environ.get("AZURE_SP_PASSWORD"))

    # Get Azure machine learning workspace
    ws = Workspace.get(
        name=os.environ.get("WORKSPACE_NAME"),
        subscription_id=os.environ.get("SUBSCRIPTION_ID"),
        resource_group=os.environ.get("AZURE_RESOURCE_GROUP"),
        auth=svc_pr)
    print("get_workspace:")
    print(ws)
    ws.write_config(path="", file_name="config.json")
    print("writing config.json.")

    # Get Azure machine learning cluster
    aml_compute = get_compute(ws, "train-cluster", "STANDARD_DS2_V2")
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    run_config = RunConfiguration(
        conda_dependencies=CondaDependencies.create(
            conda_packages=[
                'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras'
            ],
            pip_packages=[
                'azure', 'azureml-core', 'azureml-pipeline',
                'azure-storage', 'azure-storage-blob', 'azureml-dataprep'
            ]))
    run_config.environment.docker.enabled = True

    # Train
    train_step = PythonScriptStep(
        name="Train",
        source_directory="models/python/iris/train",
        script_name="train.py",
        compute_target=aml_compute,
        arguments=[],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Train Step created")

    # Evaluate: the step object is constructed, but it is not part of
    # `steps` below, so the published pipeline does not include it.
    evaluate_step = PythonScriptStep(  # noqa: F841
        name="Evaluate",
        source_directory="models/python/iris/evaluate",
        script_name="evaluate.py",
        compute_target=aml_compute,
        arguments=[],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Evaluate Step created")

    # Register
    register_step = PythonScriptStep(
        name="Register",
        source_directory="models/python/iris/register",
        script_name="register.py",
        compute_target=aml_compute,
        arguments=[],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Register Step created")

    register_step.run_after(train_step)
    steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=ws, steps=steps)
    # The former bare `train_pipeline._set_experiment_name` expression was
    # an attribute access without a call (a no-op) and has been removed.
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(name="iris-pipeline",
                                                description="")
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')

    # NOTE(review): no step in this function declares a "model_name"
    # PipelineParameter -- confirm this submit-time parameter is needed.
    pipeline_parameters = {"model_name": "iris-pipeline-param"}
    published_pipeline.submit(ws, "iris-pipeline-experiment",
                              pipeline_parameters)
def main():
    """Publish the single-step deploy pipeline and submit one run of it.

    The workspace is loaded from the local configuration via
    ``Workspace.from_config()``. A one-step pipeline that runs
    ``./deploy/deploy.py`` is validated, published as
    ``aks-deployment-pipeline`` and submitted to
    ``compute-instance-pipeline-experiment``.
    """
    ws = Workspace.from_config()
    print("get_workspace:")
    print(ws)
    # NOTE(review): this message is still printed although no
    # `ws.write_config(...)` call is active in this function.
    print("writing config.json.")

    # Get Azure machine learning cluster
    aml_compute = get_compute(ws, compute_name='cpu1', vm_size='STANDARD_D1')

    # Data stores. `output_dir` is not consumed by any step built below.
    data_dir = "pipelines/modelout"
    def_data_store = ws.get_default_datastore()
    output_dir = PipelineData(  # noqa: F841
        name="scores",
        datastore=def_data_store,
        output_path_on_compute=data_dir)

    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    run_config = RunConfiguration(
        conda_dependencies=CondaDependencies.create(
            conda_packages=[
                'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras'
            ],
            pip_packages=[
                'azure', 'azureml-core', 'azureml-pipeline',
                'azure-storage', 'azure-storage-blob', 'azureml-dataprep'
            ]))
    run_config.environment.docker.enabled = True

    # Deploy
    deploy_step = PythonScriptStep(
        name="Deploy",
        source_directory="./deploy",
        script_name="deploy.py",
        compute_target=aml_compute,
        arguments=[],
        inputs=[],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Deploy Step created")

    steps = [deploy_step]

    train_pipeline = Pipeline(workspace=ws, steps=steps)
    # The former bare `train_pipeline._set_experiment_name` expression was
    # an attribute access without a call (a no-op) and has been removed.
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name="aks-deployment-pipeline", description="")
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')

    # NOTE(review): no step in this function declares a "model_name"
    # PipelineParameter -- confirm this submit-time parameter is needed.
    pipeline_parameters = {"model_name": "sklearn_regression_model.pkl"}
    published_pipeline.submit(ws, "compute-instance-pipeline-experiment",
                              pipeline_parameters)
def main():
    """Publish the TensorFlow "Train & Convert Model" pipeline.

    Settings come from environment variables (after ``load_dotenv()``).
    Two paths on the workspace default datastore are mounted as inputs to a
    single ``EstimatorStep``; the pipeline is validated and then published
    under ``TRAINING_PIPELINE_NAME`` with ``BUILD_BUILDID`` as its version.
    """
    load_dotenv()

    # Workspace and resource group names are both derived from BASE_NAME.
    base_name = os.environ.get("BASE_NAME")
    ws_name = base_name + "-AML-WS"
    rg_name = base_name + "-AML-RG"

    # Read every remaining setting up front.
    cfg = {
        key: os.environ.get(key)
        for key in (
            "SUBSCRIPTION_ID",
            "TENANT_ID",
            "SP_APP_ID",
            "SP_APP_SECRET",
            "SOURCES_DIR_TRAIN",
            "TRAIN_SCRIPT_PATH",
            "AML_COMPUTE_CLUSTER_CPU_SKU",
            "AML_COMPUTE_CLUSTER_NAME",
            "MODEL_NAME",
            "BUILD_BUILDID",
            "TRAINING_PIPELINE_NAME",
            "DATA_PATH_DATASTORE",
            "MODEL_DATA_PATH_DATASTORE",
        )
    }

    # Get Azure machine learning workspace
    workspace = get_workspace(
        ws_name,
        rg_name,
        cfg["SUBSCRIPTION_ID"],
        cfg["TENANT_ID"],
        cfg["SP_APP_ID"],
        cfg["SP_APP_SECRET"],
    )
    print(workspace)

    # Get Azure machine learning cluster
    cluster = get_compute(
        workspace,
        cfg["AML_COMPUTE_CLUSTER_NAME"],
        cfg["AML_COMPUTE_CLUSTER_CPU_SKU"],
    )
    if aml_compute_found := (cluster is not None):
        print(cluster)
    del aml_compute_found

    model_name_param = PipelineParameter(
        name="model_name", default_value=cfg["MODEL_NAME"])
    release_id_param = PipelineParameter(
        name="release_id", default_value="0")

    # Both folders must already exist on the datastore, and their paths
    # must be supplied through the environment (variable groups).
    default_store = workspace.get_default_datastore()
    data_mount = default_store.path(cfg["DATA_PATH_DATASTORE"]).as_mount()
    model_mount = default_store.path(
        cfg["MODEL_DATA_PATH_DATASTORE"]).as_mount()

    # keras2onnx is pinned: recent versions give conversion issues.
    extra_pip_packages = [
        'keras', 'pillow', 'matplotlib', 'onnxmltools', 'keras2onnx==1.5.1'
    ]
    estimator = TensorFlow(
        source_directory=cfg["SOURCES_DIR_TRAIN"] + '/training',
        compute_target=cluster,
        entry_script=cfg["TRAIN_SCRIPT_PATH"],
        pip_packages=extra_pip_packages,
        use_gpu=True,
        framework_version='1.13',
    )

    script_arguments = [
        "--release_id", release_id_param,
        "--model_name", model_name_param,
        "--data_folder", data_mount,
        "--model_path", model_mount,
    ]
    train_step = EstimatorStep(
        name="Train & Convert Model",
        estimator=estimator,
        estimator_entry_script_arguments=script_arguments,
        runconfig_pipeline_params=None,
        inputs=[data_mount, model_mount],
        compute_target=cluster,
        allow_reuse=False,
    )
    print("Step Train & Convert created")

    pipeline = Pipeline(workspace=workspace, steps=[train_step])
    pipeline.validate()
    published = pipeline.publish(
        name=cfg["TRAINING_PIPELINE_NAME"],
        description="Model training/retraining pipeline",
        version=cfg["BUILD_BUILDID"],
    )
    print(f'Published pipeline: {published.name}')
    print(f'for build {published.version}')
def main():
    """Build and publish the prepare/train/(evaluate)/register pipeline.

    Settings are read from ``Env``. The registered dataset named
    ``e.dataset_name`` is downloaded locally and ``e.file_name`` is uploaded
    to the datastore; the pipeline steps are then defined and chained, and
    the pipeline is validated and published under ``e.pipeline_name`` with
    ``e.build_id`` as its version. The evaluation step is only chained in
    when ``e.run_evaluation`` equals ``"true"`` (case-insensitive).

    Raises:
        Exception: if ``e.dataset_name`` is not registered in the workspace.
    """
    e = Env()

    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group
    )
    print("get_workspace:")
    print(aml_workspace)

    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    environment = get_environment(
        aml_workspace, e.aml_env_name, create_new=e.rebuild_env)

    # NOTE(review): `run_config` is configured here but is not passed to any
    # of the steps defined below -- confirm whether that is intended.
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    env_vars = run_config.environment.environment_variables
    env_vars["DATASTORE_NAME"] = datastore_name

    dataset_name = e.dataset_name
    file_name = e.file_name
    datastore = Datastore.get(aml_workspace, datastore_name)

    if dataset_name not in aml_workspace.datasets:
        raise Exception("Could not find dataset at \"%s\"." % dataset_name)

    # Materialise the registered dataset locally, then push the raw file to
    # the datastore under a folder named after the dataset.
    dataset = Dataset.get_by_name(aml_workspace, name=dataset_name)
    dataset.download(target_path='.', overwrite=True)
    datastore.upload_files([file_name],
                           target_path=dataset_name,
                           overwrite=True)

    raw_data_file = DataReference(
        datastore=datastore,
        data_reference_name="Raw_Data_File",
        path_on_datastore=dataset_name + '/' + file_name)

    clean_data_file = PipelineParameter(name="clean_data_file",
                                        default_value="/clean_data.csv")
    clean_data_folder = PipelineData("clean_data_folder",
                                     datastore=datastore)

    prepDataStep = PythonScriptStep(
        name="Prepare Data",
        source_directory=e.sources_directory_train,
        script_name=e.data_prep_script_path,
        arguments=["--raw_data_file", raw_data_file,
                   "--clean_data_folder", clean_data_folder,
                   "--clean_data_file", clean_data_file],
        inputs=[raw_data_file],
        outputs=[clean_data_folder],
        compute_target=aml_compute,
        allow_reuse=False)
    print("Step Prepare Data created")

    # NOTE(review): the parameter name carries a trailing space; it is kept
    # as-is because it is part of the published pipeline's interface.
    new_model_file = PipelineParameter(
        name="new_model_file ",
        default_value='/' + e.model_name + '.pkl')
    new_model_folder = PipelineData("new_model_folder",
                                    datastore=datastore)

    est = SKLearn(
        source_directory=e.sources_directory_train,
        entry_script=e.train_script_path,
        pip_packages=['azureml-sdk',
                      'scikit-learn==0.20.3',
                      'azureml-dataprep[pandas,fuse]>=1.1.14'],
        compute_target=aml_compute)

    # The training step receives the parameters' default values, not the
    # PipelineParameter objects themselves.
    trainingStep = EstimatorStep(
        name="Model Training",
        estimator=est,
        estimator_entry_script_arguments=[
            "--clean_data_folder", clean_data_folder,
            "--new_model_folder", new_model_folder,
            "--clean_data_file", clean_data_file.default_value,
            "--new_model_file", new_model_file.default_value],
        runconfig_pipeline_params=None,
        inputs=[clean_data_folder],
        outputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)
    print("Step Train created")

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)

    evaluateStep = PythonScriptStep(
        name="Evaluate Model",
        source_directory=e.sources_directory_train,
        script_name=e.evaluate_script_path,
        arguments=["--model_name", model_name_param],
        compute_target=aml_compute,
        allow_reuse=False)
    print("Step Evaluate created")

    registerStep = PythonScriptStep(
        name="Register Model",
        source_directory=e.sources_directory_train,
        script_name=e.register_script_path,
        arguments=["--new_model_folder", new_model_folder,
                   "--new_model_file", new_model_file,
                   "--model_name", model_name_param],
        inputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)
    print("Step Register created")

    # Training always follows data preparation; evaluation is optional.
    if e.run_evaluation.lower() == 'true':
        print("Include evaluation step before register step.")
        trainingStep.run_after(prepDataStep)
        evaluateStep.run_after(trainingStep)
        registerStep.run_after(evaluateStep)
    else:
        print("Exclude evaluation step and directly run register step.")
        trainingStep.run_after(prepDataStep)
        registerStep.run_after(trainingStep)

    # Only the final step is listed; the earlier steps are attached to it
    # through the run_after chain above.
    pipeline = Pipeline(workspace=aml_workspace, steps=[registerStep])
    pipeline.validate()
    print("Pipeline is built")

    # The former bare `pipeline._set_experiment_name` expression was an
    # attribute access without a call (a no-op) and has been removed.
    published_pipeline = pipeline.publish(
        name=e.pipeline_name,
        description="Predict Employee Retention Model training pipeline",
        version=e.build_id
    )
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
destination=def_blob_store) train_step = PythonScriptStep( script_name=train_entry_point, source_directory=train_source_dir, arguments=["--input_data", ds_input], compute_target=compute_target, # , "--training_results", training_results runconfig=aml_run_config, allow_reuse=False) compare_models = [train_step] # Build the pipeline pipeline1 = Pipeline(workspace=ws, steps=train_step) pipeline1.validate() print("Pipeline validation complete") # Submit the pipeline to be run pipeline_run1 = Experiment(ws, 'Titanic_Pipeline_Notebook').submit(pipeline1) pipeline_run1.wait_for_completion() # RunDetails(pipeline_run1).show() step_runs = pipeline_run1.get_children() for step_run in step_runs: status = step_run.get_status() print('Script:', step_run.name, 'status:', status) # Change this if you want to see details even if the Step has succeeded. if status == "Failed":
def main():
    """Build and publish the train/(evaluate)/register pipeline.

    Settings are read from ``Env``. When both ``e.datastore_name`` and
    ``e.datafile_name`` are set, a tabular dataset is created from that file
    and registered as ``e.dataset_name``. The evaluation step is included
    only when ``e.run_evaluation`` equals ``"true"`` (case-insensitive). The
    pipeline is published under ``e.pipeline_name`` with ``e.build_id`` as
    its version.
    """
    e = Env()

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a run configuration environment
    conda_deps_file = "diabetes_regression/training_dependencies.yml"
    conda_deps = CondaDependencies(conda_deps_file)
    run_config = RunConfiguration(conda_dependencies=conda_deps)
    run_config.environment.docker.enabled = True

    # BUILDURI_BASE is only set when both source settings are available.
    config_envvar = {}
    if e.collection_uri is not None and e.teamproject_name is not None:
        builduri_base = e.collection_uri + e.teamproject_name
        builduri_base = builduri_base + "/_build/results?buildId="
        config_envvar["BUILDURI_BASE"] = builduri_base
    run_config.environment.environment_variables = config_envvar

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)
    build_id_param = PipelineParameter(name="build_id",
                                       default_value=e.build_id)

    # Without a configured datastore file the train step receives an empty
    # dataset name.
    dataset_name = ""
    if e.datastore_name is not None and e.datafile_name is not None:
        dataset_name = e.dataset_name
        datastore = Datastore.get(aml_workspace, e.datastore_name)
        data_path = [(datastore, e.datafile_name)]
        dataset = Dataset.Tabular.from_delimited_files(path=data_path)
        dataset.register(workspace=aml_workspace,
                         name=e.dataset_name,
                         description="dataset with training data",
                         create_new_version=True)

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id", build_id_param,
            "--model_name", model_name_param,
            "--dataset_name", dataset_name,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id", build_id_param,
            "--model_name", model_name_param,
            "--allow_run_cancel", e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id", build_id_param,
            "--model_name", model_name_param,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")

    # Check run_evaluation flag to include or exclude evaluation step.
    if e.run_evaluation.lower() == 'true':
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    # The former bare `train_pipeline._set_experiment_name` expression was
    # an attribute access without a call (a no-op) and has been removed.
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
def main():
    """Build and publish the train/(evaluate)/register pipeline.

    Settings are read from ``Env``. If ``e.dataset_name`` is not yet
    registered in the workspace, a sample CSV is created, uploaded to the
    datastore and registered as a tabular dataset. The evaluation step is
    included only when ``e.run_evaluation`` equals ``"true"``
    (case-insensitive). The pipeline is published under ``e.pipeline_name``
    with ``e.build_id`` as its version.

    Raises:
        Exception: if the dataset must be bootstrapped but the CSV file
            cannot be found on disk.
    """
    e = Env()

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        conda_dependencies_file=e.aml_env_train_conda_dep_file,
        create_new=e.rebuild_env,
    )
    run_config = RunConfiguration()
    run_config.environment = environment

    # Fall back to the workspace default datastore when none is configured.
    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    env_vars = run_config.environment.environment_variables
    env_vars["DATASTORE_NAME"] = datastore_name

    model_name_param = PipelineParameter(
        name="model_name", default_value=e.model_name)
    dataset_version_param = PipelineParameter(
        name="dataset_version", default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(
        name="data_file_path", default_value="none")
    caller_run_id_param = PipelineParameter(
        name="caller_run_id", default_value="none")

    # Get dataset name
    dataset_name = e.dataset_name

    # Check to see if dataset exists
    if dataset_name not in aml_workspace.datasets:
        # This call creates an example CSV from sklearn sample data. If you
        # have already bootstrapped your project, you can comment this line
        # out and use your own CSV.
        create_sample_data_csv()

        # Use a CSV to read in the data set.
        file_name = "automobile.csv"

        if not os.path.exists(file_name):
            raise Exception(
                ('Could not find CSV dataset at "%s". If you have '
                 'bootstrapped your project, you will need to provide '
                 'a CSV.') % file_name)

        # Upload file to the selected datastore in the workspace
        datastore = Datastore.get(aml_workspace, datastore_name)
        target_path = "training-data/"
        datastore.upload_files(
            files=[file_name],
            target_path=target_path,
            overwrite=True,
            show_progress=False,
        )

        # Register dataset
        path_on_datastore = os.path.join(target_path, file_name)
        dataset = Dataset.Tabular.from_delimited_files(
            path=(datastore, path_on_datastore))
        dataset.register(
            workspace=aml_workspace,
            name=dataset_name,
            description="automobile training data",
            tags={"format": "CSV"},
            create_new_version=True,
        )

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        "pipeline_data", datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[pipeline_data],
        arguments=[
            "--model_name", model_name_param,
            "--step_output", pipeline_data,
            "--dataset_version", dataset_version_param,
            "--data_file_path", data_file_path_param,
            "--caller_run_id", caller_run_id_param,
            "--dataset_name", dataset_name,
        ],
        runconfig=run_config,
        allow_reuse=True,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name", model_name_param,
            "--allow_run_cancel", e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--model_name", model_name_param,
            "--step_input", pipeline_data,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")

    # Check run_evaluation flag to include or exclude evaluation step.
    if e.run_evaluation.lower() == "true":
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    # The former bare `train_pipeline._set_experiment_name` expression was
    # an attribute access without a call (a no-op) and has been removed.
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
def build_pipeline(dataset, ws, config):
    """Build, validate and publish the prednet pipeline for one dataset.

    The pipeline decodes videos, prepares the frames, runs a HyperDrive
    sweep over ``train.py`` and finally runs the model registration script.

    Args:
        dataset: Dataset name; used as the last path component of the video
            data location on the default datastore and as the suffix of the
            published pipeline name.
        ws: Azure ML workspace the pipeline is built in.
        config: Mapping providing the ``'cpu_compute'`` and
            ``'gpu_compute'`` cluster names.

    Returns:
        The name the pipeline was published under (``'prednet_' + dataset``).
    """
    print("building pipeline for dataset %s in workspace %s" %
          (dataset, ws.name))

    # The scripts are copied from the current directory. (A former
    # hostname check selected '.' in both of its branches.)
    base_dir = '.'

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)

    script_files = (
        'video_decoding.py',
        'pipelines_submit.py',
        'pipelines_build.py',
        'train.py',
        'data_utils.py',
        'prednet.py',
        'keras_utils.py',
        'data_preparation.py',
        'model_registration.py',
        'config.json',
    )
    for script_file in script_files:
        shutil.copy(os.path.join(base_dir, script_file), script_folder)

    # CPU cluster: reuse it if it exists, otherwise provision it.
    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except ComputeTargetException:
        print("creating new compute target")

        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_D2_V2',
            max_nodes=4,
            idle_seconds_before_scaledown=1800)
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name,
                                                  provisioning_config)
        cpu_compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # GPU cluster: reuse it if it exists, otherwise provision it.
    gpu_compute_name = config['gpu_compute']
    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except ComputeTargetException:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6',
            max_nodes=5,
            idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name,
                                                  provisioning_config)

        # can poll for a minimum number of nodes and for a specific
        # timeout. if no min node count is provided it uses the scale
        # settings for the cluster
        gpu_compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(gpu_compute_target.get_status().serialize())

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(
        conda_packages=["py-opencv=3.4.2"],
        pip_packages=[
            "azure-storage-blob==1.5.0", "hickle==3.4.3",
            "requests==2.21.0", "sklearn", "pandas==0.24.2",
            "azureml-sdk==1.0.21", "numpy==1.16.2", "pillow==6.0.0"
        ])
    gpu_cd = CondaDependencies.create(pip_packages=[
        "keras==2.0.8", "theano==1.0.4", "tensorflow==1.8.0",
        "tensorflow-gpu==1.8.0", "hickle==3.4.3", "matplotlib==3.0.3",
        "seaborn==0.9.0", "requests==2.21.0", "bs4==0.0.1",
        "imageio==2.5.0", "sklearn", "pandas==0.24.2",
        "azureml-sdk==1.0.21", "numpy==1.16.2"
    ])

    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    gpu_compute_run_config = RunConfiguration(conda_dependencies=gpu_cd)
    gpu_compute_run_config.environment.docker.enabled = True
    gpu_compute_run_config.environment.docker.gpu_support = True
    gpu_compute_run_config.environment.docker.base_image = DEFAULT_GPU_IMAGE
    gpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    video_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="video_data",
        path_on_datastore=os.path.join("prednet", "data", "video", dataset))

    # Intermediate data handed from one step to the next.
    raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames",
                                     datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    print("DataReference object created")

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py",
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.'])
    print("video_decode created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name='prepare_data',
        script_name="data_preparation.py",
        arguments=[
            "--input_data", raw_data, "--output_data", preprocessed_data
        ],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.'])
    data_prep.run_after(video_decoding)

    print("data_prep created")

    est = TensorFlow(
        source_directory=script_folder,
        compute_target=gpu_compute_target,
        pip_packages=[
            'keras==2.0.8', 'theano', 'tensorflow==1.8.0',
            'tensorflow-gpu==1.8.0', 'matplotlib', 'horovod', 'hickle'
        ],
        entry_script='train.py',
        use_gpu=True,
        node_count=1)

    # Search space for the HyperDrive sweep.
    ps = RandomParameterSampling({
        '--batch_size': choice(2, 4, 8, 16),
        '--filter_sizes': choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        '--stack_sizes': choice("48, 96, 192", "36, 72, 144",
                                "12, 24, 48"),
        '--learning_rate': loguniform(-6, -1),
        '--lr_decay': loguniform(-9, -1),
        '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2",
                                  "2, 3", "0", "1", "2", "3"),
        '--transfer_learning': choice("True", "False")
    })

    policy = BanditPolicy(evaluation_interval=2,
                          slack_factor=0.1,
                          delay_evaluation=20)

    hdc = HyperDriveRunConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        policy=policy,
        primary_metric_name='val_loss',
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=5,
        max_concurrent_runs=5,
        max_duration_minutes=60 * 6)

    hd_step = HyperDriveStep(
        name="train_w_hyperdrive",
        hyperdrive_run_config=hdc,
        estimator_entry_script_arguments=[
            '--data-folder', preprocessed_data, '--remote_execution'
        ],
        inputs=[preprocessed_data],
        metrics_output=data_metrics,
        allow_reuse=True)
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=gpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.'])
    registration_step.run_after(hd_step)

    pipeline = Pipeline(
        workspace=ws,
        steps=[video_decoding, data_prep, hd_step, registration_step])
    print("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete")

    pipeline_name = 'prednet_' + dataset
    pipeline.publish(name=pipeline_name)

    return pipeline_name
def main():
    """Build, validate and publish the train/evaluate/register pipeline.

    Settings are read from ``Env``. The three steps share one Docker-enabled
    run configuration and are chained train -> evaluate -> register. The
    train step additionally receives the ``hyperparameter_alpha`` pipeline
    parameter (default ``0.5``) as ``--alpha``. The pipeline is published
    under ``e.pipeline_name`` with ``e.build_id`` as its version.
    """
    e = Env()

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(name=e.workspace_name,
                                  subscription_id=e.subscription_id,
                                  resource_group=e.resource_group)
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    run_config = RunConfiguration(
        conda_dependencies=CondaDependencies.create(
            conda_packages=[
                'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras'
            ],
            pip_packages=[
                'azure', 'azureml-core', 'azure-storage',
                'azure-storage-blob'
            ]))
    run_config.environment.docker.enabled = True

    # BUILDURI_BASE is only set when both source settings are available.
    config_envvar = {}
    if e.collection_uri is not None and e.teamproject_name is not None:
        builduri_base = e.collection_uri + e.teamproject_name
        builduri_base = builduri_base + "/_build/results?buildId="
        config_envvar["BUILDURI_BASE"] = builduri_base
    run_config.environment.environment_variables = config_envvar

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)
    build_id_param = PipelineParameter(name="build_id",
                                       default_value=e.build_id)
    hyperparameter_alpha_param = PipelineParameter(
        name="hyperparameter_alpha", default_value=0.5)

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id", build_id_param,
            "--model_name", model_name_param,
            "--alpha", hyperparameter_alpha_param,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id", build_id_param,
            "--model_name", model_name_param,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id", build_id_param,
            "--model_name", model_name_param,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")

    # Enforce train -> evaluate -> register ordering.
    evaluate_step.run_after(train_step)
    register_step.run_after(evaluate_step)
    steps = [train_step, evaluate_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    # The former bare `train_pipeline._set_experiment_name` expression was
    # an attribute access without a call (a no-op) and has been removed.
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
def main():
    """Publish the train -> evaluate pipeline configured via environment.

    Loads variables with ``load_dotenv()``, resolves the workspace and the
    compute cluster through the ``get_workspace`` / ``get_compute`` helpers,
    builds the two script steps and publishes the validated pipeline under
    ``TRAINING_PIPELINE_NAME`` with ``BUILD_BUILDID`` as its version.
    """
    load_dotenv()

    # Workspace and resource group names are both derived from BASE_NAME.
    base_name = os.environ.get("BASE_NAME")
    workspace_name = base_name + "-AML-WS"
    resource_group = base_name + "-AML-RG"

    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    default_model_name = os.environ.get("MODEL_NAME")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")

    # Azure ML workspace
    aml_workspace = get_workspace(
        workspace_name,
        resource_group,
        subscription_id,
        tenant_id,
        app_id,
        app_secret,
    )
    print(aml_workspace)

    # Azure ML compute cluster
    aml_compute = get_compute(aml_workspace, compute_name, vm_size)
    if aml_compute is not None:
        print(aml_compute)

    dependencies = CondaDependencies.create(
        conda_packages=[
            'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras'
        ],
        pip_packages=[
            'azure', 'azureml-core', 'azure-storage', 'azure-storage-blob'
        ],
    )
    run_config = RunConfiguration(conda_dependencies=dependencies)
    run_config.environment.docker.enabled = True

    model_name_param = PipelineParameter(
        name="model_name", default_value=default_model_name)
    release_id_param = PipelineParameter(
        name="release_id", default_value="0")

    # Both steps receive the same script arguments and run settings.
    script_arguments = [
        "--release_id", release_id_param,
        "--model_name", model_name_param,
    ]
    shared_step_args = dict(
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        runconfig=run_config,
        allow_reuse=False,
    )

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=train_script_path,
        arguments=list(script_arguments),
        **shared_step_args,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=evaluate_script_path,
        arguments=list(script_arguments),
        **shared_step_args,
    )
    print("Step Evaluate created")

    # Only the final step is handed to the Pipeline; the train step is
    # attached to it through the run_after relation.
    evaluate_step.run_after(train_step)
    train_pipeline = Pipeline(workspace=aml_workspace, steps=[evaluate_step])
    train_pipeline.validate()

    published_pipeline = train_pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline",
        version=build_id,
    )
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
) print("trainStep created") # ### Create and Validate the Pipeline # # Note that the *trainStep* has an implicit data dependency on the *processTrainDataStep*, and thus you only include the *trainStep* in your Pipeline object. When you run the pipeline, you will observe that it first runs the **processTrainDataStep**, followed by the **trainStep**. # In[ ]: pipeline = Pipeline(workspace=ws, steps=[trainStep]) print ("Pipeline is built") pipeline.validate() print("Simple validation complete") # ### Submit the Pipeline # # At this point you can run the pipeline and examine the output it produced. # In[ ]: pipeline_run = Experiment(ws, experiment_name).submit(pipeline) print("Pipeline is submitted for execution") # ### Monitor the Run Details
def main():
    """Submit the Databricks train/register pipeline and poll until it ends.

    Configuration comes from environment variables. Two ``DatabricksStep``
    objects are created (training, then model registration), the pipeline is
    validated and submitted to the "pipetest" experiment, and its status is
    polled every 10 seconds while it is 'Running' or 'NotStarted'.

    Exits the process with status 1 when the final pipeline status is
    'Failed'; any other final status is printed and treated as completed.
    """
    # An unset or empty DATABRICKS_CLUSTER_ID both mean "no existing cluster".
    cluster_id = os.environ.get("DATABRICKS_CLUSTER_ID") or None

    workspace_name = os.environ.get("AML_WORKSPACE_NAME")
    resource_group = os.environ.get("RESOURCE_GROUP")
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    experiment_subfolder = os.environ.get("EXPERIMENT_FOLDER",
                                          'aml_service/experiment')
    sources_directory = os.environ.get("SOURCES_DIR")
    experiment_folder = os.path.join(sources_directory, experiment_subfolder)
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    databricks_workspace_name = os.environ.get("DATABRICKS_WORKSPACE_NAME")
    databricks_access_token = os.environ.get("DATABRICKS_ACCESS_TOKEN")
    databricks_compute_name_aml = os.environ.get(
        "DATABRICKS_COMPUTE_NAME_AML")
    model_dir = os.environ.get("MODEL_DIR", 'dbfs:/model')
    model_name = os.environ.get("MODEL_NAME", 'torchcnn')

    # Everything after the first "/" of MODEL_DIR is re-rooted under /dbfs/,
    # e.g. 'dbfs:/model' + 'torchcnn' -> '/dbfs/model/torchcnn.pth'.
    path_components = model_dir.split("/", 1)
    model_path = "/dbfs/" + path_components[1] + "/" + model_name + ".pth"
    print("The model path will be %s" % (model_path))

    aml_workspace = get_workspace(
        workspace_name,
        resource_group,
        subscription_id,
        tenant_id,
        app_id,
        app_secret)
    print(aml_workspace)

    databricks_compute = get_compute(
        aml_workspace,
        databricks_compute_name_aml,
        resource_group,
        databricks_workspace_name,
        databricks_access_token)
    print(databricks_compute)

    step1 = DatabricksStep(
        name="DBPythonInLocalMachine",
        num_workers=1,
        python_script_name=train_script_path,
        source_directory=sources_directory,
        run_name='DB_Python_Local_demo',
        existing_cluster_id=cluster_id,
        compute_target=databricks_compute,
        allow_reuse=False,
        python_script_params=['--MODEL_PATH', model_path])

    # NOTE(review): the service principal secret is passed to the script as
    # a plain command-line parameter here; confirm this is acceptable for
    # wherever step parameters end up being recorded.
    step2 = DatabricksStep(
        name="RegisterModel",
        num_workers=1,
        python_script_name="register_model.py",
        source_directory=experiment_folder,
        run_name='Register_model',
        existing_cluster_id=cluster_id,
        compute_target=databricks_compute,
        allow_reuse=False,
        python_script_params=[
            '--MODEL_PATH', model_path,
            '--TENANT_ID', tenant_id,
            '--APP_ID', app_id,
            '--APP_SECRET', app_secret,
            '--MODEL_NAME', model_name
        ])
    step2.run_after(step1)
    print("Step lists created")

    pipeline = Pipeline(
        workspace=aml_workspace,
        steps=[step1, step2])
    print("Pipeline is built")

    pipeline.validate()
    print("Pipeline validation complete")

    pipeline_run = pipeline.submit(experiment_name="pipetest")
    print("Pipeline is submitted for execution")

    pipeline_details = pipeline_run.get_details()
    pipeline_run_id = pipeline_details['runId']
    azure_run_url = get_experiment_run_url(
        subscription_id,
        resource_group,
        workspace_name,
        pipeline_run_id)
    print("To check details of the Pipeline run, go to " + azure_run_url)

    # Poll every 10 seconds; report progress every 30 seconds. The progress
    # line shows the status fetched on the previous iteration.
    pipeline_status = pipeline_run.get_status()
    timer_mod = 0
    while pipeline_status in ('Running', 'NotStarted'):
        timer_mod = timer_mod + 10
        time.sleep(10)
        if (timer_mod % 30) == 0:
            print("Status: %s. %s seconds have passed." %
                  (pipeline_status, timer_mod))
        pipeline_status = pipeline_run.get_status()

    if pipeline_status == 'Failed':
        print("AML Pipeline failed. Check %s for details." % (azure_run_url))
        sys.exit(1)
    else:
        # NOTE(review): any non-'Failed' terminal status (including a
        # cancelled run) ends up here and is reported as completed.
        print(pipeline_status)
        print("Pipeline completed")
def main():
    """Publish and submit the TensorFlow training pipeline.

    Loads settings from the environment, resolves the workspace and compute
    cluster, makes sure the 'tfworld' blob datastore and the
    'Azure Services Dataset' file dataset are registered, builds an
    estimator-based train step followed by an evaluate step, publishes the
    pipeline, submits it to ``EXPERIMENT_NAME`` and finally looks up the AKS
    cluster named by ``AKS_CLUSTER_NAME``.
    """
    load_dotenv()
    workspace_name = os.environ.get("BASE_NAME") + "-AML-WS"
    resource_group = "AML-RG-" + os.environ.get("BASE_NAME")
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    aks_name = os.environ.get("AKS_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")
    experiment_name = os.environ.get("EXPERIMENT_NAME")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(
        workspace_name,
        resource_group,
        subscription_id,
        tenant_id,
        app_id,
        app_secret)
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, compute_name, vm_size)
    if aml_compute is not None:
        print(aml_compute)

    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=['numpy', 'pandas', 'scikit-learn', 'keras'],
        pip_packages=[
            'azure', 'azureml-sdk', 'azure-storage', 'azure-storage-blob',
            'transformers>=2.1.1', 'tensorflow>=2.0.0',
            'tensorflow-gpu>=2.0.0'
        ]))
    run_config.environment.docker.enabled = True

    datastore_name = 'tfworld'
    container_name = 'azure-service-classifier'
    account_name = 'johndatasets'
    # NOTE(review): a SAS token is hard-coded in source; consider moving it
    # to configuration.
    sas_token = '?sv=2019-02-02&ss=bfqt&srt=sco&sp=rl&se=2021-06-02T03:40:25Z&st=2020-03-09T19:40:25Z&spr=https&sig=bUwK7AJUj2c%2Fr90Qf8O1sojF0w6wRFgL2c9zMVCWNPA%3D'  # noqa: E501

    # Reuse the datastore when it can be fetched, otherwise register it.
    # `except Exception` (rather than a bare except) keeps the fallback while
    # letting KeyboardInterrupt / SystemExit propagate.
    try:
        existing_datastore = Datastore.get(aml_workspace, datastore_name)
    except Exception:
        existing_datastore = Datastore.register_azure_blob_container(
            workspace=aml_workspace,
            datastore_name=datastore_name,
            container_name=container_name,
            account_name=account_name,
            sas_token=sas_token,
        )

    azure_dataset = Dataset.File.from_files(path=(existing_datastore, 'data'))
    azure_dataset = azure_dataset.register(
        workspace=aml_workspace,
        name='Azure Services Dataset',
        description='Dataset containing azure related posts on Stackoverflow',
        create_new_version=True)

    # The return value is intentionally unused; the call is kept as in the
    # original flow.
    azure_dataset.to_path()
    input_data = azure_dataset.as_named_input('input_data1').as_mount(
        '/tmp/data')

    model_name = PipelineParameter(
        name="model_name", default_value=model_name)
    max_seq_length = PipelineParameter(
        name="max_seq_length", default_value=128)
    learning_rate = PipelineParameter(
        name="learning_rate", default_value=3e-5)
    num_epochs = PipelineParameter(
        name="num_epochs", default_value=3)
    export_dir = PipelineParameter(
        name="export_dir", default_value="./outputs/exports")
    batch_size = PipelineParameter(
        name="batch_size", default_value=32)
    steps_per_epoch = PipelineParameter(
        name="steps_per_epoch", default_value=100)

    # initialize the TensorFlow estimator
    estimator = TensorFlow(
        source_directory=sources_directory_train,
        entry_script=train_script_path,
        compute_target=aml_compute,
        framework_version='2.0',
        use_gpu=True,
        pip_packages=[
            'transformers==2.0.0',
            'azureml-dataprep[fuse,pandas]==1.3.0'
        ])

    train_step = EstimatorStep(
        name="Train Model",
        estimator=estimator,
        estimator_entry_script_arguments=[
            "--data_dir", input_data,
            "--max_seq_length", max_seq_length,
            "--learning_rate", learning_rate,
            "--num_epochs", num_epochs,
            "--export_dir", export_dir,
            "--batch_size", batch_size,
            "--steps_per_epoch", steps_per_epoch
        ],
        compute_target=aml_compute,
        inputs=[input_data],
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=evaluate_script_path,
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        arguments=[
            "--model_name", model_name,
            "--build_id", build_id,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    # Currently, the Evaluate step will automatically register
    # the model if it performs better. This step is based on a
    # previous version of the repo which utilized JSON files to
    # track evaluation results.
    evaluate_step.run_after(train_step)
    steps = [evaluate_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline",
        version=build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')

    # The submitted run object is not needed afterwards, so it is not bound.
    published_pipeline.submit(
        workspace=aml_workspace,
        experiment_name=experiment_name)

    # Get AKS cluster for deployment
    aks_compute = get_aks(aml_workspace, aks_name)
    if aks_compute is not None:
        print(aks_compute)
def main():
    """Publish the train -> evaluate -> register pipeline and save its URL.

    Settings come from the environment (after ``load_dotenv()``). The three
    script steps exchange data through a ``PipelineData`` named
    "jsonconfigs"; the published pipeline's REST endpoint is written to
    ``ml_service/pipelines/train_pipeline.json``.
    """
    load_dotenv()

    # Workspace and resource group names are both derived from BASE_NAME.
    base_name = os.environ.get("BASE_NAME")
    workspace_name = base_name + "-AML-WS"
    resource_group = base_name + "-AML-RG"

    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
    register_script_path = os.environ.get("REGISTER_SCRIPT_PATH")
    vm_size_cpu = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU")
    compute_name_cpu = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    default_model_name = os.environ.get("MODEL_NAME")

    # Azure ML workspace
    aml_workspace = get_workspace(
        workspace_name,
        resource_group,
        subscription_id,
        tenant_id,
        app_id,
        app_secret,
    )
    print(aml_workspace)

    # Azure ML compute cluster
    aml_compute_cpu = get_compute(
        aml_workspace, compute_name_cpu, vm_size_cpu)
    if aml_compute_cpu is not None:
        print(aml_compute_cpu)

    dependencies = CondaDependencies.create(
        conda_packages=[
            'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras'
        ],
        pip_packages=[
            'azure', 'azureml-core', 'azure-storage', 'azure-storage-blob'
        ],
    )
    run_config = RunConfiguration(conda_dependencies=dependencies)
    run_config.environment.docker.enabled = True

    model_name_param = PipelineParameter(
        name="model_name", default_value=default_model_name)

    def_blob_store = Datastore(aml_workspace, "workspaceblobstore")
    jsonconfigs = PipelineData("jsonconfigs", datastore=def_blob_store)
    # Hour-resolution timestamp, fixed at pipeline build time.
    config_suffix = datetime.datetime.now().strftime("%Y%m%d%H")

    # Settings shared by all three script steps.
    shared_step_args = dict(
        compute_target=aml_compute_cpu,
        source_directory=sources_directory_train,
        runconfig=run_config,
        allow_reuse=False,
    )

    # The train step produces jsonconfigs ...
    train_step = PythonScriptStep(
        name="Train Model",
        script_name=train_script_path,
        arguments=[
            "--config_suffix", config_suffix,
            "--json_config", jsonconfigs,
            "--model_name", model_name_param,
        ],
        outputs=[jsonconfigs],
        **shared_step_args,
    )
    print("Step Train created")

    # ... and the two later steps consume it.
    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=evaluate_script_path,
        arguments=[
            "--config_suffix", config_suffix,
            "--json_config", jsonconfigs,
        ],
        inputs=[jsonconfigs],
        **shared_step_args,
    )
    print("Step Evaluate created")

    register_model_step = PythonScriptStep(
        name="Register New Trained Model",
        script_name=register_script_path,
        arguments=[
            "--config_suffix", config_suffix,
            "--json_config", jsonconfigs,
            "--model_name", model_name_param,
        ],
        inputs=[jsonconfigs],
        **shared_step_args,
    )
    print("Step register model created")

    evaluate_step.run_after(train_step)
    register_model_step.run_after(evaluate_step)

    # Only the final step is listed; earlier steps hang off it through
    # the run_after chain.
    train_pipeline = Pipeline(
        workspace=aml_workspace, steps=[register_model_step])
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name="training-pipeline",
        description="Model training/retraining pipeline",
    )

    # Persist the REST endpoint of the published pipeline.
    train_pipeline_json = {"rest_endpoint": published_pipeline.endpoint}
    json_file_path = "ml_service/pipelines/train_pipeline.json"
    with open(json_file_path, "w") as outfile:
        json.dump(train_pipeline_json, outfile)
def main():
    """Build, validate and publish the training pipeline.

    Settings are read from an ``Env`` instance. The run configuration uses
    the environment returned by ``get_environment`` and exports
    ``DATASTORE_NAME`` to the steps. The train step hands its output to the
    register step through a ``PipelineData``; the evaluate step is included
    between them only when ``e.run_evaluation`` equals 'true'
    (case-insensitive). The pipeline is published under ``e.pipeline_name``
    with ``e.build_id`` as the version.
    """
    e = Env()

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable Azure ML environment
    environment = get_environment(
        aml_workspace, e.aml_env_name, create_new=False)
    run_config = RunConfiguration()
    run_config.environment = environment

    # Fall back to the workspace default datastore when none is configured.
    if (e.datastore_name):
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables[
        "DATASTORE_NAME"] = datastore_name

    model_name_param = PipelineParameter(
        name="model_name", default_value=e.model_name)
    dataset_version_param = PipelineParameter(
        name="dataset_version", default_value=e.dataset_version)
    data_file_path_param = PipelineParameter(
        name="data_file_path", default_value="none")
    caller_run_id_param = PipelineParameter(
        name="caller_run_id", default_value="none")

    # Get dataset name
    dataset_name = e.dataset_name

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        'pipeline_data',
        datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        outputs=[pipeline_data],
        arguments=[
            "--model_name", model_name_param,
            "--step_output", pipeline_data,
            "--dataset_version", dataset_version_param,
            "--data_file_path", data_file_path_param,
            "--caller_run_id", caller_run_id_param,
            "--dataset_name", dataset_name,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--model_name", model_name_param,
            "--allow_run_cancel", e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--model_name", model_name_param,
            "--step_input", pipeline_data,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")

    # Check run_evaluation flag to include or exclude evaluation step.
    if ((e.run_evaluation).lower() == 'true'):
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    # NOTE: the former bare `train_pipeline._set_experiment_name` expression
    # only referenced the attribute without calling it, so it had no effect
    # and has been dropped.
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
def main():
    """Build, validate and publish the diabetes training pipeline.

    Settings are read from an ``Env`` instance. The run environment is
    loaded from ``e.sources_directory_train`` and registered in the
    workspace. When the dataset named ``e.dataset_name`` is not yet in the
    workspace, it is created from the ``load_diabetes()`` sample data,
    uploaded to the default datastore and registered. The train step hands
    its output to the register step through a ``PipelineData``; the evaluate
    step is included between them only when ``e.run_evaluation`` equals
    'true' (case-insensitive).
    """
    e = Env()

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print("get_workspace:")
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    # Create a reusable run configuration environment
    # Read definition from diabetes_regression/azureml_environment.json
    environment = Environment.load_from_directory(e.sources_directory_train)
    # BUILDURI_BASE is only exported when both URL parts are configured.
    if (e.collection_uri is not None and e.teamproject_name is not None):
        builduri_base = e.collection_uri + e.teamproject_name
        builduri_base = builduri_base + "/_build/results?buildId="
        environment.environment_variables["BUILDURI_BASE"] = builduri_base
    environment.register(aml_workspace)

    run_config = RunConfiguration()
    run_config.environment = environment

    model_name_param = PipelineParameter(
        name="model_name", default_value=e.model_name)
    build_id_param = PipelineParameter(
        name="build_id", default_value=e.build_id)

    # Get dataset name
    dataset_name = e.dataset_name

    # Check to see if dataset exists
    if (dataset_name not in aml_workspace.datasets):
        # Create dataset from diabetes sample data
        sample_data = load_diabetes()
        df = pd.DataFrame(
            data=sample_data.data,
            columns=sample_data.feature_names)
        df['Y'] = sample_data.target
        file_name = 'diabetes.csv'
        df.to_csv(file_name, index=False)

        # Upload file to default datastore in workspace
        default_ds = aml_workspace.get_default_datastore()
        target_path = 'training-data/'
        default_ds.upload_files(
            files=[file_name],
            target_path=target_path,
            overwrite=True,
            show_progress=False)

        # Register dataset
        path_on_datastore = os.path.join(target_path, file_name)
        dataset = Dataset.Tabular.from_delimited_files(
            path=(default_ds, path_on_datastore))
        dataset = dataset.register(
            workspace=aml_workspace,
            name=dataset_name,
            description='diabetes training data',
            tags={'format': 'CSV'},
            create_new_version=True)

    # Get the dataset
    dataset = Dataset.get_by_name(aml_workspace, dataset_name)

    # Create a PipelineData to pass data between steps
    pipeline_data = PipelineData(
        'pipeline_data',
        datastore=aml_workspace.get_default_datastore())

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=e.train_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[dataset.as_named_input('training_data')],
        outputs=[pipeline_data],
        arguments=[
            "--build_id", build_id_param,
            "--model_name", model_name_param,
            "--step_output", pipeline_data
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=e.evaluate_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--build_id", build_id_param,
            "--model_name", model_name_param,
            "--allow_run_cancel", e.allow_run_cancel,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_step = PythonScriptStep(
        name="Register Model ",
        script_name=e.register_script_path,
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        inputs=[pipeline_data],
        arguments=[
            "--build_id", build_id_param,
            "--model_name", model_name_param,
            "--step_input", pipeline_data,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Register created")

    # Check run_evaluation flag to include or exclude evaluation step.
    if ((e.run_evaluation).lower() == 'true'):
        print("Include evaluation step before register step.")
        evaluate_step.run_after(train_step)
        register_step.run_after(evaluate_step)
        steps = [train_step, evaluate_step, register_step]
    else:
        print("Exclude evaluation step and directly run register step.")
        register_step.run_after(train_step)
        steps = [train_step, register_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    # NOTE: the former bare `train_pipeline._set_experiment_name` expression
    # only referenced the attribute without calling it, so it had no effect
    # and has been dropped.
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=e.pipeline_name,
        description="Model training/retraining pipeline",
        version=e.build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')