def __init__(self, workspace, name, datastore_type, container_name, account_name,
             sas_token=None, account_key=None, protocol=None, endpoint=None):
    """Initialize the state shared by Azure-storage-backed datastores.

    :param workspace: The workspace this datastore belongs to.
    :type workspace: azureml.core.workspace.Workspace
    :param name: The name of the datastore. It can only contain alphanumeric
        characters or - or _.
    :type name: str
    :param datastore_type: The type of this datastore, either "AzureBlob" or "AzureFile".
    :type datastore_type: str
    :param container_name: The container name.
    :type container_name: str
    :param account_name: The storage account name.
    :type account_name: str
    :param sas_token: The SAS token for accessing this container, defaults to None.
    :type sas_token: str, optional
    :param account_key: The storage account key, defaults to None.
    :type account_key: str, optional
    :param protocol: The protocol to use to connect to the storage account.
        Stored as given; a None value is presumably resolved to https by the
        code that consumes it.
    :type protocol: str, optional
    :param endpoint: The endpoint of the blob container. Stored as given; a
        None value is presumably resolved to core.windows.net by the code
        that consumes it.
    :type endpoint: str, optional
    """
    super(AbstractAzureStorageDatastore, self).__init__(workspace, name, datastore_type)

    self.container_name = container_name
    self.account_name = account_name
    self.sas_token = sas_token
    self.account_key = account_key

    # A SAS token wins over an account key when both are supplied; with
    # neither, the credential type is the literal string 'None'.
    if sas_token:
        self.credential_type = 'Sas'
    elif account_key:
        self.credential_type = 'AccountKey'
    else:
        self.credential_type = 'None'

    self.protocol = protocol
    self.endpoint = endpoint

    self._num_workers = 32
    # Reference to the root of this datastore.
    self._data_reference = DataReference(datastore=self)
def get_by_data_reference(cls, workspace, path):
    """Return a DataReference to ``path`` on the class's global dataset store.

    The datastore is looked up in ``workspace`` by the name held in
    ``cls.DEFAULT_GLOBAL_DATASET_STORE``; the reference is named with
    ``cls.DEFAULT_DATA_REFERENCE_NAME``.

    :param workspace: The workspace in which to look up the datastore.
    :param path: The path on the datastore that the reference points to.
    :return: The data reference for ``path``.
    :rtype: DataReference
    """
    global_store = Datastore(workspace, cls.DEFAULT_GLOBAL_DATASET_STORE)
    reference_kwargs = {
        "datastore": global_store,
        "data_reference_name": cls.DEFAULT_DATA_REFERENCE_NAME,
        "path_on_datastore": path,
    }
    return DataReference(**reference_kwargs)
def upload(self, src_dir, target_path=None, overwrite=False, show_progress=True):
    """Upload a local directory to the blob container this datastore points to.

    :param src_dir: The local directory to upload.
    :type src_dir: str
    :param target_path: The location in the blob container to upload to.
        If None, upload to the root. Defaults to None.
    :type target_path: str
    :param overwrite: Indicates whether to overwrite existing files. Defaults to False.
    :type overwrite: bool, optional
    :param show_progress: Indicates whether to show progress of the upload in
        the console. Defaults to True.
    :type show_progress: bool, optional
    :return: The DataReference instance for the target path uploaded.
    :rtype: azureml.data.data_reference.DataReference
    """
    module_logger.info("Called AzureBlobDatastore.upload")
    self._ensure_credential("Upload")
    if not target_path:
        target_path = ""

    def blob_exists(target_file_path):
        # Existence probe handed to the upload task alongside `overwrite`.
        return self.blob_service.exists(self.container_name, target_file_path)

    def make_upload_task(target, source):
        # Returns a zero-argument callable so the actual transfer is deferred
        # until the upload task decides to run it.
        def upload_one():
            return self.blob_service.create_blob_from_path(
                self.container_name, target, source)
        return upload_one

    upload_plan = self._get_upload_from_dir(src_dir, target_path)
    count = self._start_upload_task(
        upload_plan, overwrite, blob_exists, show_progress, make_upload_task)

    module_logger.info(
        "Finished AzureBlobDatastore.upload with count={0}.".format(count))
    return DataReference(datastore=self, path_on_datastore=target_path)
def upload(self, src_dir, target_path=None, overwrite=False, show_progress=True):
    """Upload a local directory to the file share this datastore points to.

    :param src_dir: The local directory to upload.
    :type src_dir: str
    :param target_path: The location in the file share to upload to.
        If None, upload to the root.
    :type target_path: str
    :param overwrite: Indicates whether to overwrite existing files.
    :type overwrite: bool, optional
    :param show_progress: Indicates whether to show the progress of the upload
        in the console.
    :type show_progress: bool, optional
    :return: The DataReference instance for the target path uploaded.
    :rtype: azureml.data.data_reference.DataReference
    """
    module_logger.info("Called AzureFileDatastore.upload")
    if not target_path:
        target_path = ""

    def file_exists(target_file_path):
        # The file service takes the directory and the file name separately.
        directory_name, file_name = os.path.split(target_file_path)
        return self.file_service.exists(
            self.container_name, directory_name, file_name)

    upload_plan = self._get_upload_from_dir(src_dir, target_path)
    count = self._start_upload_task(
        upload_plan, overwrite, file_exists, show_progress,
        self._file_share_upload)

    module_logger.info(
        "Finished AzureFileDatastore.upload with count={0}.".format(count))
    return DataReference(datastore=self, path_on_datastore=target_path)
def _get_data_references(self, request_id, internal_datastore):
    """Register the datastores for a request and build its data references.

    Two blob datastores are registered in ``self.ws``, both in the storage
    account described by ``internal_datastore``:

    * the internal datastore, which holds all user-facing files (list of
      images, detection results, list of failed images) and is also the
      input of each job;
    * the output datastore (container ``api_config.AML_CONTAINER``), which
      receives the output of score.py from each job.

    :param request_id: Identifier of the request; '-' is replaced by '_' to
        form valid datastore names.
    :type request_id: str
    :param internal_datastore: Mapping with the keys ``account_name``,
        ``account_key`` and ``container_name``.
    :type internal_datastore: dict
    :return: ``(internal_dir, output_dir)`` - a mounted DataReference on the
        internal datastore and a mounted PipelineData on the output datastore.
    :rtype: tuple
    :raises RuntimeError: If registering the datastores or creating the data
        references fails; the original exception is attached as ``__cause__``.
    """
    print('AMLCompute, _get_data_references() called. Request ID: {}'.format(
        request_id))

    # Datastore names may only contain alphanumeric characters and _.
    request_id_to_use_for_datastore = request_id.replace('-', '_')

    try:
        # NOTE(review): an earlier comment here said the overwrite flag is set
        # to True, but no `overwrite` argument is passed to the registration
        # calls below - confirm which behaviour is intended.
        internal_datastore_name = 'internal_datastore_{}'.format(
            request_id_to_use_for_datastore)
        internal_account_name = internal_datastore['account_name']
        internal_account_key = internal_datastore['account_key']
        internal_container_name = internal_datastore['container_name']
        registered_internal = Datastore.register_azure_blob_container(
            self.ws,
            internal_datastore_name,
            internal_container_name,
            internal_account_name,
            account_key=internal_account_key)
        print('internal_datastore done')

        # The output datastore is another container in the same storage
        # account as the internal datastore.
        output_datastore_name = 'output_datastore_{}'.format(
            request_id_to_use_for_datastore)
        output_container_name = api_config.AML_CONTAINER
        output_datastore = Datastore.register_azure_blob_container(
            self.ws,
            output_datastore_name,
            output_container_name,
            internal_account_name,
            account_key=internal_account_key)
        print('output_datastore done')
    except Exception as e:
        # Chain the cause so the underlying failure stays in the traceback.
        raise RuntimeError(
            'Error in connecting to the datastores for AML Compute: {}'.
            format(str(e))) from e

    try:
        internal_dir = DataReference(datastore=registered_internal,
                                     data_reference_name='internal_dir',
                                     mode='mount')
        output_dir = PipelineData(
            'output_{}'.format(request_id_to_use_for_datastore),
            datastore=output_datastore,
            output_mode='mount')
        print('Finished setting up the Data References.')
    except Exception as e:
        raise RuntimeError(
            'Error in creating data references for AML Compute: {}.'.
            format(str(e))) from e

    return internal_dir, output_dir
def _setup_datareference(self, name, path):
    """
    Build a DataReference on this object's blob datastore (``self.blob_ds``).

    :param str name: [required] name of the data reference
    :param str path: [required] path on the datastore where the data lives.
    :returns: the data reference pointing at ``path``
    :rtype: DataReference
    """
    return DataReference(
        datastore=self.blob_ds,
        data_reference_name=name,
        path_on_datastore=path,
    )
def upload_files(self, files, relative_root=None, target_path=None, overwrite=False, show_progress=True):
    """Upload local files to the blob container this datastore points to.

    :param files: A list of absolute paths of files to upload.
    :type files: builtin.list[str]
    :param relative_root: The root used to determine the path of the files in
        the blob. For example, if we upload /path/to/file.txt, and we define
        base path to be /path, when file.txt is uploaded to the blob storage,
        it will have the path of /to/file.txt. If target_path is also given,
        then it will be used as the prefix for the derived path from above.
        The base path must be a common path of all of the files, otherwise an
        exception will be thrown. Defaults to None, which will find the
        common path.
    :type relative_root: str, optional
    :param target_path: The location in the blob container to upload the data
        to. Defaults to None, the root.
    :type target_path: str, optional
    :param overwrite: Indicates whether to overwrite existing files. Defaults to False.
    :type overwrite: bool, optional
    :param show_progress: Indicates whether to show progress of the upload in
        the console. Defaults to True.
    :type show_progress: bool, optional
    :return: The DataReference instance for the target path uploaded.
    :rtype: azureml.data.data_reference.DataReference
    """
    module_logger.info("Called AzureBlobDatastore.upload_files")
    target_path = target_path or ""
    relative_root = relative_root or common_path(files)

    def blob_exists(target_file_path):
        # Existence probe handed to the upload task alongside `overwrite`.
        return self.blob_service.exists(self.container_name, target_file_path)

    def make_upload_task(target, source):
        # Returns a zero-argument callable so the transfer itself is deferred.
        return lambda: self.blob_service.create_blob_from_path(
            self.container_name, target, source)

    count = self._start_upload_task(
        self._get_upload_from_files(files, target_path, relative_root, False),
        overwrite,
        blob_exists,
        show_progress,
        make_upload_task)

    # The completion message names this method (it previously said "upload").
    module_logger.info(
        "Finished AzureBlobDatastore.upload_files with count={0}.".format(count))
    return DataReference(datastore=self, path_on_datastore=target_path)
def createDataReference(workspace, storage_name, storage_key, storage_container_name, data_store_name, data_reference_name):
    '''
    Fetch the named datastore from the workspace, registering it as an Azure
    blob container datastore if the lookup fails, then create an
    azureml.data.data_reference.DataReference on it that can be used in an
    Azure ML pipeline step.

    PARAMS:
        workspace               : azureml.core.Workspace : Existing AMLS Workspace
        storage_name            : string : Name of the Azure Storage Account
        storage_key             : string : Access Key to the Azure Storage Account
        storage_container_name  : string : Container name to receive blobs. Must exist
        data_store_name         : string : Name of the registered data store.
        data_reference_name     : string : Name of the data reference

    RETURNS:
        tuple(azureml.core.datastore.Datastore, azureml.data.data_reference.DataReference)

    RAISES:
        Exception if no datastore could be found or created.
    '''
    try:
        data_store = Datastore.get(workspace, data_store_name)
        print("Found existing data store - ", data_store_name)
    except Exception:
        # Any lookup failure is treated as "not registered yet"; errors from
        # the registration call itself propagate to the caller.
        print("Creating data store - ", data_store_name)
        data_store = Datastore.register_azure_blob_container(
            workspace,
            datastore_name=data_store_name,
            container_name=storage_container_name,
            account_name=storage_name,
            account_key=storage_key,
        )

    if data_store is None:
        raise Exception("Could not create/find data store.")

    return data_store, DataReference(datastore=data_store,
                                     data_reference_name=data_reference_name)
def upload_files(files, datastore, relative_root=None, target_path=None, overwrite=False, show_progress=True):
    """Upload a list of local files to ``datastore`` and return a reference to the target.

    :param files: Paths of the local files to upload.
    :param datastore: The datastore to upload to.
    :param relative_root: Root used to derive the remote paths; when falsy,
        the common path of ``files`` is used.
    :param target_path: Destination path on the datastore; a falsy value means "".
    :param overwrite: Passed to the upload task and to each file upload.
    :param show_progress: Passed to the upload task.
    :return: DataReference for ``target_path`` on ``datastore``.
    """
    from azureml.data._upload_helper import _start_upload_task, _get_upload_from_files
    from azureml.data.data_reference import DataReference

    if not target_path:
        target_path = ""
    # Called up front with the result discarded - presumably to surface
    # datastore access errors before any upload starts (TODO confirm).
    _file_exists(dstore=datastore, path=target_path)
    if not relative_root:
        relative_root = common_path(files)

    def remote_exists(target_file_path):
        return _file_exists(dstore=datastore, path=target_file_path)

    def make_upload_task(target, source):
        # `target` is not used: each file is uploaded relative to
        # `relative_root` under `target_path`.
        def upload_one():
            return _upload_file(base_path=relative_root,
                                local_file_path=source,
                                remote_target_path=target_path,
                                datastore=datastore,
                                overwrite=overwrite)
        return upload_one

    upload_plan = _get_upload_from_files(files, target_path, relative_root, True)
    _start_upload_task(
        upload_plan, overwrite, remote_exists, show_progress, make_upload_task)
    return DataReference(datastore=datastore, path_on_datastore=target_path)
def upload_dir(src_dir, remote_target_path, datastore, overwrite=False, show_progress=True):
    """Upload a local directory to ``datastore`` and return a reference to the target.

    :param src_dir: The local directory to upload.
    :param remote_target_path: Destination path on the datastore; a falsy
        value means "".
    :param datastore: The datastore to upload to.
    :param overwrite: Passed to the upload task and to each file upload.
    :param show_progress: Passed to the upload task.
    :return: DataReference for ``remote_target_path`` on ``datastore``.
    """
    from azureml.data._upload_helper import _start_upload_task, _get_upload_from_dir
    from azureml.data.data_reference import DataReference

    if not remote_target_path:
        remote_target_path = ""
    # Called up front with the result discarded - presumably to surface
    # datastore access errors before any upload starts (TODO confirm).
    _file_exists(dstore=datastore, path=remote_target_path)

    def remote_exists(target_file_path):
        return _file_exists(dstore=datastore, path=target_file_path)

    def make_upload_task(target, source):
        # `target` is not used: each file is uploaded relative to `src_dir`
        # under `remote_target_path`.
        def upload_one():
            return _upload_file(base_path=src_dir,
                                local_file_path=source,
                                remote_target_path=remote_target_path,
                                datastore=datastore,
                                overwrite=overwrite)
        return upload_one

    upload_plan = _get_upload_from_dir(src_dir, remote_target_path)
    _start_upload_task(
        upload_plan, overwrite, remote_exists, show_progress, make_upload_task)
    return DataReference(datastore=datastore,
                         path_on_datastore=remote_target_path)
def my_azure_app(cfg: DictConfig) -> None:
    """Submit ``run.py`` as a PyTorch estimator run on an Azure ML compute cluster.

    Steps, in order:

    1. Dump the (unresolved) config to a timestamped JSON file under
       ``root_path``.
    2. Connect to the "experiments" workspace using the subscription id from
       the ``AZ_SUBS_ID`` environment variable.
    3. Reuse the "gpucluster" compute target, or create it if missing.
    4. Upload the data directory and the JSON config to the workspace's
       default datastore.
    5. Submit the estimator with data references to the config and root path.

    :param cfg: Hydra/OmegaConf configuration. The keys read here are
        ``yaml_file``, ``root_path``, ``experiment_name``, ``disable_gpu``,
        ``data_subdir`` and ``vm_size``.
    :raises AssertionError: If ``AZ_SUBS_ID`` is not set.
    """
    print(cfg.pretty())
    args_dict = OmegaConf.to_container(cfg, resolve=False)

    # NOTE(review): str(datetime.now()) puts spaces and colons into the file
    # name - confirm this is acceptable on every platform/datastore in use.
    yaml_file_nm = args_dict["yaml_file"].split("/")[-1].split(".")[0]
    conf_file = os.path.join(
        args_dict["root_path"],
        yaml_file_nm + "_" + str(datetime.datetime.now()) + ".json",
    )
    print(conf_file)
    with open(conf_file, "w") as out:
        out.write(json.dumps(args_dict))

    cluster_name = "gpucluster"
    experiment_name = args_dict["experiment_name"] + "_azure"
    disable_gpu = args_dict["disable_gpu"]
    # Hydra changes the working directory; use the original one as the
    # source directory for the submitted run.
    script_folder = hydra.utils.get_original_cwd()
    data_path = os.path.join(args_dict["root_path"], args_dict["data_subdir"])

    sub_id = os.getenv("AZ_SUBS_ID")
    if sub_id is None:
        # Explicit raise instead of `assert`, so the check survives `python -O`.
        raise AssertionError("AZ_SUBS_ID environment variable is not set")

    ws = Workspace.get(
        name="experiments",
        subscription_id=sub_id,
        resource_group="default_resource_group",
    )

    # AmlCompute is created in the same region as the workspace.
    try:
        compute_target = ComputeTarget(workspace=ws, name=cluster_name)
        print("Found existing compute target")
    except ComputeTargetException:
        print("Creating a new compute target...")
        compute_config = AmlCompute.provisioning_configuration(
            vm_size=args_dict["vm_size"], max_nodes=1)
        compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
        compute_target.wait_for_completion(show_output=True,
                                           min_node_count=None,
                                           timeout_in_minutes=10)

    s = ws.get_default_datastore()

    # A reference to the root_path in Azure after uploading.
    _ = s.upload(
        src_dir=data_path,
        target_path=data_path,
        overwrite=False,
        show_progress=True,
    )

    # Everything except the file name.
    script_target_path = "/".join(conf_file.split("/")[:-1])
    print(script_target_path)

    script_fname = conf_file.split("/")[-1]
    print(script_fname)
    print("---" * 100)

    azure_script_path = s.upload_files(
        files=[conf_file],
        target_path=script_target_path,
        overwrite=True,
        show_progress=True,
    )
    print(azure_script_path)

    azure_script_abs_path = DataReference(datastore=s,
                                          data_reference_name="input_data",
                                          path_on_datastore=conf_file)
    azure_root_path = DataReference(
        datastore=s,
        data_reference_name="root_data",
        path_on_datastore=args_dict["root_path"],
    )

    exp = Experiment(workspace=ws, name=experiment_name)

    # Use the PyTorch estimator to submit the job.
    script_params = {
        "--config_file": azure_script_abs_path,
        "--root_path": azure_root_path,
        "--experiment_name": experiment_name,
    }
    print("GPU Disabled: {}".format(disable_gpu))

    estimator = PyTorch(
        source_directory=script_folder,
        script_params=script_params,
        compute_target=compute_target,
        entry_script="run.py",
        use_gpu=not disable_gpu,
        pip_packages=["pillow==5.4.1"],
    )

    # The returned run object is not used here.
    _ = exp.submit(estimator)
cpu_compute_target.wait_for_completion(show_output=True) # Create GPU compute target print('Creating GPU compute target ...') gpu_cluster_name = 'k80cluster' gpu_compute_config = AmlCompute.provisioning_configuration( vm_size='Standard_NC6', idle_seconds_before_scaledown=1200, min_nodes=0, max_nodes=2) gpu_compute_target = ComputeTarget.create(workspace, gpu_cluster_name, gpu_compute_config) gpu_compute_target.wait_for_completion(show_output=True) # Get datastore reference datastore = DataReference(datastore, mode='mount') # Step 1: Data ingestion data_ingestion_step, data_ingestion_outputs = data_ingestion_step( datastore, cpu_compute_target) # Step 2: Data preprocessing data_preprocess_step, data_preprocess_outputs = data_preprocess_step( data_ingestion_outputs['raw_data_dir'], cpu_compute_target) # Step 3: Train Model train_step, train_outputs = train_step(data_preprocess_outputs['train_dir'], data_preprocess_outputs['valid_dir'], gpu_compute_target) # Step 4: Evaluate Model
def partition_by(self, partition_keys, target, name=None, show_progress=True, partition_as_file_dataset=False):
    """Write this dataset as partitioned parquet files and return a dataset over them.

    The data is written to a fresh guid folder under ``target``, one
    directory level per partition key. A new dataset is then created from
    the written path with a matching partition format, and registered when
    ``name`` is provided.

    .. code-block:: python

        ds = Dataset.get_by_name('test') # indexed by country, state, partition_date

        # #1: call partition_by locally
        new_ds = ds.partition_by(name="repartitioned_ds", partition_keys=['country'],
                    target=DataPath(datastore, "repartition"))
        partition_keys = new_ds.partition_keys # ['country']

        # new_ds can be passed to PRS as input dataset

    :param partition_keys: Required, partition keys. Must be non-empty,
        free of duplicates, and every key must be a column of this dataset.
    :type partition_keys: builtin.list[str]
    :param target: Required, the datastore path where the dataframe parquet data will be uploaded to.
        A guid folder will be generated under the target path to avoid conflict.
    :type target: azureml.data.datapath.DataPath, azureml.core.datastore.Datastore
        or tuple(azureml.core.datastore.Datastore, str) object
    :param name: Optional, The registration name. When None the dataset is
        returned without being registered.
    :type name: str
    :param show_progress: Optional, indicates whether to show progress of the upload in the console.
        Defaults to be True.
    :type show_progress: bool
    :param partition_as_file_dataset: Optional, indicates whether returns a filedataset or not.
        Defaults to be False.
    :type partition_as_file_dataset: bool
    :return: The saved or registered dataset.
    :rtype: azureml.data.TabularDataset
    :raises azureml.exceptions.UserErrorException: If ``partition_keys`` is
        empty, contains duplicates, or names a column that does not exist.
    """
    from uuid import uuid4
    from azureml.exceptions import UserErrorException
    from azureml.core import Dataset
    from azureml.data.data_reference import DataReference
    from azureml.data._dataset_factory_helper import get_progress_logger, parse_target
    from azureml.dataprep import FieldType
    from azureml.data.dataset_factory import TabularDatasetFactory
    import time

    # Start of the interval reported as "execution_time" in the usage telemetry below.
    starting_time = time.process_time()

    console = get_progress_logger(show_progress)
    console("Validating arguments.")
    if len(partition_keys) == 0:
        raise UserErrorException("partition_keys cannot be empty")

    column_types = self._dataflow.dtypes
    invalid_keys = []
    for key in partition_keys:
        if key not in column_types:
            invalid_keys.append(key)
    if len(invalid_keys) != 0:
        raise UserErrorException(
            "{0} are invalid partition keys".format(invalid_keys))

    if len(partition_keys) != len(set(partition_keys)):
        raise UserErrorException("partition_keys cannot have duplicates")
    console("Arguments validated.")

    # A fresh guid folder keeps repeated calls on the same target apart.
    guid = uuid4()
    datastore, relative_path = parse_target(target)
    relative_path_with_guid = "/%s/%s/" % (relative_path, guid)

    # partition_format names the key of each directory level;
    # partition_path is the glob matching the written files.
    partition_format = relative_path_with_guid
    partition_path = relative_path_with_guid
    saved_dataset_key_column_types = {}

    for key in partition_keys:
        if column_types[key] == FieldType.DATE:
            partition_format = partition_format + '{' + key + ':yyyyMMddHHmmss}*/'
            # Dropped so that DATE keys are skipped by the
            # `key in column_types` check below.
            del column_types[key]
        else:
            partition_format = partition_format + '{' + key + '}/'
        partition_path = partition_path + '*/'
        if key in column_types:
            saved_dataset_key_column_types[key] = column_types[key]

    partition_format = partition_format + '*.parquet'
    partition_path = partition_path + '*.parquet'

    console("Uploading file to {}".format(relative_path_with_guid))

    self._dataflow.write_to_parquet(
        partition_keys=partition_keys,
        directory_path=DataReference(datastore=datastore).path(
            relative_path_with_guid)).run_local()
    console("Successfully uploaded file to datastore.")

    console("Creating a new dataset.")
    if partition_as_file_dataset:
        saved_dataset = Dataset.File.\
            from_files(path=(datastore, partition_path), partition_format=partition_format)
    else:
        saved_dataset = TabularDatasetFactory.\
            from_parquet_files(path=(datastore, partition_path), partition_format=partition_format)
    # Re-wrap the dataflow with the recorded key column types and this
    # dataset's properties.
    # NOTE(review): the nesting of this statement was reconstructed from
    # flattened source; it is placed after the if/else (applies to both
    # branches) - confirm against the upstream file.
    saved_dataset = TabularDataset._create(
        saved_dataset._dataflow.set_column_types(
            saved_dataset_key_column_types),
        self._properties,
        telemetry_info=self._telemetry_info)

    console("Successfully created a new dataset.")

    # Usage telemetry is only collected when this dataset is registered in a workspace.
    if self._registration and self._registration.workspace:
        collect_datasets_usage(
            _get_logger(), _PATITION_BY_ACTIVITY, [self],
            self._registration.workspace, "N/A", {
                "execution_time": time.process_time() - starting_time,
                "number_of_partition_keys": len(partition_keys)
            })

    if name is None:
        return saved_dataset
    console("registering a new dataset.")
    registered_dataset = saved_dataset.register(datastore.workspace,
                                                name,
                                                create_new_version=True)
    console("Successfully created and registered a new dataset.")
    return registered_dataset
service_principal = ServicePrincipalAuthentication( tenant_id=tenant_id, service_principal_id=application_id, service_principal_password=app_secret) ws = Workspace.get(name=workspace_name, subscription_id=subscription_id, resource_group=resource_group, auth=service_principal) # Retrieve the pointer to the default Blob storage. def_blob_store = Datastore(ws, "workspaceblobstore") print("Blobstore's name: {}".format(def_blob_store.name)) blob_input_data = DataReference(datastore=def_blob_store, data_reference_name="mnist_datainput", path_on_datastore="mnist_datainput") print("DataReference object created") # Create a CPU cluster of type D2 V2 with 1 node. (due to subscription's limitations we stick to 1 node) try: compute_target_cpu = ComputeTarget(workspace=ws, name=cluster_name_cpu) print('Found existing compute target.') except ComputeTargetException: print('Creating a new compute target...') # CPU: Standard_D3_v2 # GPU: Standard_NV6 compute_config = AmlCompute.provisioning_configuration( vm_size='STANDARD_D2_V2', max_nodes=1, min_nodes=1)
print("Default datastore's name: {}".format(def_file_store.name)) def_blob_store = Datastore(ws, "workspaceblobstore") print("Blobstore's name: {}".format(def_blob_store.name)) # In[ ]: # Upload the raw training data to the blob storage def_blob_store.upload(src_dir=data_location, target_path='nyc-taxi-raw-features', overwrite=True, show_progress=True) raw_train_data = DataReference(datastore=def_blob_store, data_reference_name="nyc_taxi_raw_features", path_on_datastore="nyc-taxi-raw-features/nyc-taxi-sample-data.csv") print("DataReference object created") # ### Create the Process Training Data Pipeline Step # The intermediate data (or output of a Step) is represented by PipelineData object. PipelineData can be produced by one step and consumed in another step by providing the PipelineData object as an output of one step and the input of one or more steps. # # The process training data pipeline step takes the raw_train_data DataReference object as input, and it will output an intermediate PipelineData object that holds the processed training data with the new engineered features for datetime components: hour of the day, and day of the week. # # Review and run the cell below to construct the PipelineData objects and the PythonScriptStep pipeline step: # # *Open preprocess.py in the local machine and examine the arguments, inputs, and outputs for the script. Note that there is an argument called process_mode to distinguish between processing training data vs test data. Reviewing the Python script file will give you a good sense of why the script argument names used below are important.* # In[ ]:
"PUT YOUR STORAGE ACCOUNT KEY HERE") # Storage account key try: blob_datastore = Datastore.get(ws, blob_datastore_name) print("Found Blob Datastore with name: %s" % blob_datastore_name) except: blob_datastore = Datastore.register_azure_blob_container( workspace=ws, datastore_name=blob_datastore_name, account_name=account_name, # Storage account name container_name=container_name, # Name of Azure blob container account_key=account_key) # Storage account key print("Registered blob datastore with name: %s" % blob_datastore_name) blob_data_ref = DataReference(datastore=blob_datastore, data_reference_name="blob_test_data", path_on_datastore="testdata") csv_path = (blob_datastore, '/creditcard.csv') try: tab_ds = Dataset.Tabular.from_delimited_files(path=csv_path) tab_ds = tab_ds.register(workspace=ws, name='creditcard') except Exception as ex: print(ex) else: print('Dataset already registered.') creditds = ws.datasets['creditcard'] df = creditds.to_pandas_dataframe() default_ds = ws.get_default_datastore()
path_on_datastore="training_set_labels.csv") train_labels_path_parameter = PipelineParameter( name="train_labels", default_value=train_labels_datapath) train_labels_path = (train_labels_path_parameter, DataPathComputeBinding(mode="mount")) test_features_datapath = DataPath(datastore=blobstore, path_on_datastore="test_set_features.csv") test_features_path_parameter = PipelineParameter( name="test_features", default_value=test_features_datapath) test_features_path = (test_features_path_parameter, DataPathComputeBinding(mode="mount")) submission_format_path = DataReference( data_reference_name="submission_format", datastore=blobstore, path_on_datastore="submission_format.csv", ) submission_path = PipelineData(name="submission", datastore=blobstore) model_path = PipelineData(name="model", datastore=blobstore) step = PythonScriptStep( script_name="script.py", source_directory="script", name="flu_shot_learning", arguments=[ train_features_path, train_labels_path, test_features_path,
def build_prednet_pipeline(dataset, ws):
    """Build, publish, schedule and submit the prednet pipeline for ``dataset``.

    The pipeline consists of six steps that run strictly one after another:
    data preparation, hyperdrive training of prednet, prednet registration,
    batch scoring, classifier training and classifier registration. After
    validation it is published as ``"prednet_" + dataset``, a datastore
    schedule is created for it, and one run is submitted.

    The scripts are first staged into ``./scripts``; staging is safe to
    repeat, an existing staged ``models`` folder is replaced.

    Reads ``args.cpu_compute_name`` and ``args.gpu_compute_name`` from the
    module-level ``args``.

    :param dataset: Name of the dataset; used in script arguments, the
        pipeline name and the watched datastore path.
    :type dataset: str
    :param ws: The Azure ML workspace to build the pipeline in.
    :type ws: azureml.core.Workspace
    """
    print("building pipeline for dataset %s in workspace %s" %
          (dataset, ws.name))

    base_dir = "."

    def_blob_store = ws.get_default_datastore()

    # Folder for the scripts that need to be uploaded to the Aml compute target.
    script_folder = "./scripts"
    # exist_ok: a previous invocation may already have created the folder.
    os.makedirs(script_folder, exist_ok=True)

    # copytree refuses to write into an existing directory, so drop the copy
    # staged by a previous invocation before refreshing it.
    staged_models = os.path.join(base_dir, script_folder, "models")
    if os.path.isdir(staged_models):
        shutil.rmtree(staged_models)
    shutil.copytree(os.path.join(base_dir, "models"), staged_models)

    for script_name in (
            "train.py",
            "data_preparation.py",
            "register_prednet.py",
            "batch_scoring.py",
            "train_clf.py",
            "register_clf.py",
    ):
        shutil.copy(os.path.join(base_dir, script_name), script_folder)

    cpu_compute_name = args.cpu_compute_name
    cpu_compute_target = AmlCompute(ws, cpu_compute_name)
    print("found existing compute target: %s" % cpu_compute_name)

    # get_status() gives a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    gpu_compute_name = args.gpu_compute_name
    gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
    print(gpu_compute_target.get_status().serialize())

    env = Environment.get(ws, "prednet")

    # Runconfigs
    runconfig = RunConfiguration()
    runconfig.environment = env
    print("PipelineData object created")

    # DataReference to where raw data is stored.
    raw_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="raw_data",
        path_on_datastore=os.path.join("prednet", "data", "raw_data"),
    )
    print("DataReference object created")

    # Intermediate data passed between the steps.
    preprocessed_data = PipelineData("preprocessed_data",
                                     datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    hd_child_cwd = PipelineData("prednet_model_path",
                                datastore=def_blob_store)
    scored_data = PipelineData("scored_data", datastore=def_blob_store)
    model_path = PipelineData("model_path", datastore=def_blob_store)

    # Prepare the dataset for training/testing the recurrent neural network.
    data_prep = PythonScriptStep(
        name="prepare_data",
        script_name="data_preparation.py",
        arguments=[
            "--raw_data",
            raw_data,
            "--preprocessed_data",
            preprocessed_data,
            "--dataset",
            dataset,
        ],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    print("data_prep step created")

    est = Estimator(
        source_directory=script_folder,
        compute_target=gpu_compute_target,
        entry_script="train.py",
        node_count=1,
        environment_definition=env,
    )

    ps = BayesianParameterSampling({
        "--batch_size":
        choice(1, 2, 4, 10),
        "--filter_sizes":
        choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        "--stack_sizes":
        choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),
        "--learning_rate":
        uniform(1e-6, 1e-3),
        "--lr_decay":
        uniform(1e-9, 1e-2),
        "--freeze_layers":
        choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
    })

    hdc = HyperDriveConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        primary_metric_name="val_loss",
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=3,
        max_concurrent_runs=3,
        max_duration_minutes=60 * 6,
    )

    train_prednet = HyperDriveStep(
        "train_w_hyperdrive",
        hdc,
        estimator_entry_script_arguments=[
            "--preprocessed_data",
            preprocessed_data,
            "--remote_execution",
            "--dataset",
            dataset,
        ],
        inputs=[preprocessed_data],
        outputs=[hd_child_cwd],
        metrics_output=data_metrics,
        allow_reuse=True,
    )
    train_prednet.run_after(data_prep)

    register_prednet = PythonScriptStep(
        name="register_prednet",
        script_name="register_prednet.py",
        arguments=[
            "--data_metrics",
            data_metrics,
        ],
        compute_target=cpu_compute_target,
        inputs=[data_metrics, hd_child_cwd],
        source_directory=script_folder,
        allow_reuse=True,
    )
    register_prednet.run_after(train_prednet)

    batch_scoring = PythonScriptStep(
        name="batch_scoring",
        script_name="batch_scoring.py",
        arguments=[
            "--preprocessed_data",
            preprocessed_data,
            "--scored_data",
            scored_data,
            "--dataset",
            dataset,
        ],
        compute_target=gpu_compute_target,
        inputs=[preprocessed_data],
        outputs=[scored_data],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    batch_scoring.run_after(register_prednet)

    train_clf = PythonScriptStep(
        name="train_clf",
        script_name="train_clf.py",
        arguments=[
            "--preprocessed_data", preprocessed_data, "--scored_data",
            scored_data, "--model_path", model_path
        ],
        compute_target=cpu_compute_target,
        inputs=[preprocessed_data, scored_data],
        outputs=[model_path],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    train_clf.run_after(batch_scoring)

    register_clf = PythonScriptStep(
        name="register_clf",
        script_name="register_clf.py",
        arguments=["--model_path", model_path],
        inputs=[model_path],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        allow_reuse=True,
        runconfig=runconfig,
    )
    register_clf.run_after(train_clf)

    pipeline = Pipeline(
        workspace=ws,
        steps=[
            data_prep,
            train_prednet,
            register_prednet,
            batch_scoring,
            train_clf,
            register_clf,
        ],
    )
    pipeline.validate()

    pipeline_name = "prednet_" + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    # Datastore schedule on the dataset's "Train" folder; the returned
    # Schedule object is not needed here.
    _ = Schedule.create(
        workspace=ws,
        name=pipeline_name + "_sch",
        pipeline_id=published_pipeline.id,
        experiment_name=pipeline_name,
        datastore=def_blob_store,
        wait_for_provisioning=True,
        description="Datastore scheduler for Pipeline" + pipeline_name,
        path_on_datastore=os.path.join("prednet/data/raw_data", dataset,
                                       "Train"),
        polling_interval=60 * 24,
    )

    published_pipeline.submit(ws, pipeline_name)
# get the workspace print("Getting a reference to workspace %s" % workspace_name) ws = Workspace.get(name=workspace_name, subscription_id=subscription_id, resource_group=resource_group) experiment = Experiment(workspace=ws, name='automl-diabetes') aml_compute = AmlCompute(ws, compute_target_name) # read in the data print("Getting a reference to default datastore") datastore = ws.get_default_datastore() print("Preparing the 'prep data' step") blob_diabetes_data = DataReference( datastore=datastore, data_reference_name="diabetes_data", path_on_datastore="diabetesdata/diabetes_pima.csv") # Create a new runconfig object aml_run_config = RunConfiguration() aml_run_config.target = aml_compute aml_run_config.environment.docker.enabled = True aml_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE aml_run_config.environment.python.user_managed_dependencies = False aml_run_config.environment.python.conda_dependencies = CondaDependencies.create( conda_packages=['pandas', 'scikit-learn', 'numpy'], pip_packages=[ 'azureml-sdk', 'azureml-dataprep', 'azureml-dataprep[pandas]', 'azureml-train-automl' ], pin_sdk_version=False)
def main():
    """Build, validate and publish the model training pipeline.

    Settings are read from ``Env()``. The pipeline consists of a data
    preparation step, a training step, an optional evaluation step and a
    model registration step, chained with ``run_after``.

    Before the steps are built, the registered dataset named
    ``e.dataset_name`` is downloaded to the current directory and
    ``e.file_name`` is uploaded to the datastore under the dataset name.

    :raises Exception: If ``e.dataset_name`` is not a registered dataset in
        the workspace.
    """
    e = Env()

    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group
    )
    print("get_workspace:")
    print(aml_workspace)

    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    environment = get_environment(
        aml_workspace, e.aml_env_name, create_new=e.rebuild_env)
    # NOTE(review): run_config is configured here but is not passed to any
    # step below - confirm whether the steps were meant to receive it.
    run_config = RunConfiguration()
    run_config.environment = environment

    # Fall back to the workspace default datastore when none is configured.
    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables["DATASTORE_NAME"] \
        = datastore_name

    dataset_name = e.dataset_name
    file_name = e.file_name
    datastore = Datastore.get(aml_workspace, datastore_name)

    if dataset_name not in aml_workspace.datasets:
        raise Exception("Could not find dataset at \"%s\"." % dataset_name)

    # Materialise the dataset locally, then push the raw file to the
    # datastore so the pipeline can reference it.
    dataset = Dataset.get_by_name(aml_workspace, name=dataset_name)
    dataset.download(target_path='.', overwrite=True)
    datastore.upload_files([file_name],
                           target_path=dataset_name,
                           overwrite=True)

    raw_data_file = DataReference(
        datastore=datastore,
        data_reference_name="Raw_Data_File",
        path_on_datastore=dataset_name + '/' + file_name)

    clean_data_file = PipelineParameter(name="clean_data_file",
                                        default_value="/clean_data.csv")
    clean_data_folder = PipelineData("clean_data_folder",
                                     datastore=datastore)

    prepDataStep = PythonScriptStep(
        name="Prepare Data",
        source_directory=e.sources_directory_train,
        script_name=e.data_prep_script_path,
        arguments=["--raw_data_file", raw_data_file,
                   "--clean_data_folder", clean_data_folder,
                   "--clean_data_file", clean_data_file],
        inputs=[raw_data_file],
        outputs=[clean_data_folder],
        compute_target=aml_compute,
        allow_reuse=False)
    print("Step Prepare Data created")

    # NOTE(review): the parameter name below ends with a space
    # ("new_model_file "). It looks like a typo, but it is kept unchanged
    # because the name is part of the published pipeline's interface.
    new_model_file = PipelineParameter(name="new_model_file ",
                                       default_value='/' + e.model_name
                                       + '.pkl')
    new_model_folder = PipelineData("new_model_folder",
                                    datastore=datastore)

    est = SKLearn(source_directory=e.sources_directory_train,
                  entry_script=e.train_script_path,
                  pip_packages=['azureml-sdk',
                                'scikit-learn==0.20.3',
                                'azureml-dataprep[pandas,fuse]>=1.1.14'],
                  compute_target=aml_compute)

    # The estimator step receives the parameters' default values, not the
    # PipelineParameter objects themselves.
    trainingStep = EstimatorStep(
        name="Model Training",
        estimator=est,
        estimator_entry_script_arguments=[
            "--clean_data_folder", clean_data_folder,
            "--new_model_folder", new_model_folder,
            "--clean_data_file", clean_data_file.default_value,
            "--new_model_file", new_model_file.default_value],
        runconfig_pipeline_params=None,
        inputs=[clean_data_folder],
        outputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)
    print("Step Train created")

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)

    evaluateStep = PythonScriptStep(
        name="Evaluate Model",
        source_directory=e.sources_directory_train,
        script_name=e.evaluate_script_path,
        arguments=["--model_name", model_name_param],
        compute_target=aml_compute,
        allow_reuse=False)
    print("Step Evaluate created")

    registerStep = PythonScriptStep(
        name="Register Model",
        source_directory=e.sources_directory_train,
        script_name=e.register_script_path,
        arguments=["--new_model_folder", new_model_folder,
                   "--new_model_file", new_model_file,
                   "--model_name", model_name_param],
        inputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)
    print("Step Register created")

    # Training always follows data preparation; evaluation is optional.
    trainingStep.run_after(prepDataStep)
    if (e.run_evaluation).lower() == 'true':
        print("Include evaluation step before register step.")
        evaluateStep.run_after(trainingStep)
        registerStep.run_after(evaluateStep)
    else:
        print("Exclude evaluation step and directly run register step.")
        registerStep.run_after(trainingStep)

    # Only the final step is listed; the earlier steps are linked to it
    # through the run_after calls above.
    pipeline = Pipeline(workspace=aml_workspace, steps=[registerStep])
    pipeline.validate()
    print("Pipeline is built")

    published_pipeline = pipeline.publish(
        name=e.pipeline_name,
        description="Predict Employee Retention Model training pipeline",
        version=e.build_id
    )
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
def build_pipeline(dataset, ws, config):
    """Build, publish and schedule the PredNet training pipeline.

    The pipeline has four chained steps: video decoding, data preparation,
    a HyperDrive training sweep and model registration. After publishing,
    a schedule that watches a path on the default datastore is created for
    the published pipeline.

    :param dataset: Dataset name; used in datastore paths, passed to the
        training script and used in the pipeline name
        (``'prednet_' + dataset``).
    :type dataset: str
    :param ws: Workspace in which compute targets, the pipeline and the
        schedule are created.
    :param config: Mapping that provides the keys ``cpu_compute``,
        ``gpu_compute``, ``acr_address``, ``acr_username`` and
        ``acr_password``.
    :return: The name under which the pipeline was published.
    :rtype: str
    """
    print("building pipeline for dataset %s in workspace %s" % (dataset, ws.name))

    base_dir = '.'

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)

    for script_file in (
            'video_decoding.py',
            'pipelines_submit.py',
            'pipelines_create.py',
            'train.py',
            'data_utils.py',
            'prednet.py',
            'keras_utils.py',
            'data_preparation.py',
            'model_registration.py',
            'config.json'):
        shutil.copy(os.path.join(base_dir, script_file), script_folder)

    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except Exception:
        # Any lookup failure is treated as "target does not exist yet".
        # Exception (not a bare except) so Ctrl-C / SystemExit still
        # propagate. NOTE(review): narrowing to ComputeTargetException would
        # be preferable if that name is imported in this module.
        print("creating new compute target")

        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_D2_V2',
            max_nodes=4,
            idle_seconds_before_scaledown=1800)
        cpu_compute_target = ComputeTarget.create(
            ws, cpu_compute_name, provisioning_config)
        cpu_compute_target.wait_for_completion(
            show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = config['gpu_compute']

    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except Exception:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6',
            max_nodes=10,
            idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(
            ws, gpu_compute_name, provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(
            show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    # Best effort only: a failure to report status must not abort the build.
    try:
        print(gpu_compute_target.get_status().serialize())
    except Exception as e:
        print("Could not get status of compute target.")
        print(e)

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(
        conda_packages=["py-opencv=3.4.2"],
        pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D',
        pip_packages=[
            "azure-storage-blob==1.5.0",
            "hickle==3.4.3",
            "requests==2.21.0",
            "sklearn",
            "pandas==0.24.2",
            "azureml-sdk",
            "numpy==1.16.2",
            "pillow==6.0.0"])

    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    # DataReference to where video data is stored.
    video_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="video_data",
        path_on_datastore=os.path.join("prednet", "data", "video", dataset))
    print("DataReference object created")

    # Intermediate data passed between the pipeline steps.
    raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames",
                                     datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py",
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    print("video_decode step created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name='prepare_data',
        script_name="data_preparation.py",
        arguments=["--input_data", raw_data,
                   "--output_data", preprocessed_data],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    data_prep.run_after(video_decoding)

    print("data_prep step created")

    # configure access to ACR for pulling our custom docker image
    acr = ContainerRegistry()
    acr.address = config['acr_address']
    acr.username = config['acr_username']
    acr.password = config['acr_password']

    est = Estimator(source_directory=script_folder,
                    compute_target=gpu_compute_target,
                    entry_script='train.py',
                    use_gpu=True,
                    node_count=1,
                    custom_docker_image="wopauli_1.8-gpu:1",
                    image_registry_details=acr,
                    user_managed=True
                    )

    # Hyperparameter search space; list-valued options are passed to the
    # training script as comma-separated strings.
    ps = RandomParameterSampling(
        {
            '--batch_size': choice(1, 2, 4, 8),
            '--filter_sizes': choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
            '--stack_sizes': choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),
            '--learning_rate': loguniform(-6, -1),
            '--lr_decay': loguniform(-9, -1),
            '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
            '--transfer_learning': choice("True", "False")
        }
    )

    policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1,
                          delay_evaluation=10)

    hdc = HyperDriveConfig(estimator=est,
                           hyperparameter_sampling=ps,
                           policy=policy,
                           primary_metric_name='val_loss',
                           primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                           max_total_runs=10,
                           max_concurrent_runs=5,
                           max_duration_minutes=60*6
                           )

    hd_step = HyperDriveStep(
        name="train_w_hyperdrive",
        hyperdrive_run_config=hdc,
        estimator_entry_script_arguments=[
            '--data-folder', preprocessed_data,
            '--remote_execution',
            '--dataset', dataset
        ],
        inputs=[preprocessed_data],
        metrics_output=data_metrics,
        allow_reuse=True
    )
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=cpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.']
    )
    registration_step.run_after(hd_step)

    pipeline = Pipeline(
        workspace=ws,
        steps=[video_decoding, data_prep, hd_step, registration_step])
    print("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete")

    pipeline_name = 'prednet_' + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    # Schedule bound to a datastore path. The returned Schedule object is
    # not needed here.
    # NOTE(review): the description has no space between "Pipeline" and the
    # pipeline name; left unchanged.
    Schedule.create(
        workspace=ws,
        name=pipeline_name + "_sch",
        pipeline_id=published_pipeline.id,
        experiment_name=pipeline_name,
        datastore=def_blob_store,
        wait_for_provisioning=True,
        description="Datastore scheduler for Pipeline" + pipeline_name,
        path_on_datastore=os.path.join('prednet/data/video', dataset, 'Train'),
        polling_interval=1
    )

    return pipeline_name
def build_pipeline(dataset, ws, config):
    """Build and publish the PredNet training pipeline for one dataset.

    The pipeline has four chained steps: video decoding, data preparation,
    a HyperDrive training sweep and model registration.

    :param dataset: Dataset name; used in the datastore path of the input
        videos and in the pipeline name (``'prednet_' + dataset``).
    :type dataset: str
    :param ws: Workspace in which compute targets and the pipeline are
        created.
    :param config: Mapping that provides the keys ``cpu_compute`` and
        ``gpu_compute`` (compute target names).
    :return: The name under which the pipeline was published.
    :rtype: str
    """
    print("building pipeline for dataset %s in workspace %s" %
          (dataset, ws.name))

    # The scripts are copied from the current directory. (A former
    # per-hostname switch selected '.' in both of its branches.)
    base_dir = '.'

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)

    for script_file in (
            'video_decoding.py',
            'pipelines_submit.py',
            'pipelines_build.py',
            'train.py',
            'data_utils.py',
            'prednet.py',
            'keras_utils.py',
            'data_preparation.py',
            'model_registration.py',
            'config.json'):
        shutil.copy(os.path.join(base_dir, script_file), script_folder)

    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except ComputeTargetException:
        print("creating new compute target")

        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_D2_V2',
            max_nodes=4,
            idle_seconds_before_scaledown=1800)
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name,
                                                  provisioning_config)
        cpu_compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = config['gpu_compute']

    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except ComputeTargetException:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6',
            max_nodes=5,
            idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name,
                                                  provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(gpu_compute_target.get_status().serialize())

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(
        conda_packages=["py-opencv=3.4.2"],
        pip_packages=[
            "azure-storage-blob==1.5.0", "hickle==3.4.3", "requests==2.21.0",
            "sklearn", "pandas==0.24.2", "azureml-sdk==1.0.21",
            "numpy==1.16.2", "pillow==6.0.0"
        ])
    gpu_cd = CondaDependencies.create(pip_packages=[
        "keras==2.0.8", "theano==1.0.4", "tensorflow==1.8.0",
        "tensorflow-gpu==1.8.0", "hickle==3.4.3", "matplotlib==3.0.3",
        "seaborn==0.9.0", "requests==2.21.0", "bs4==0.0.1", "imageio==2.5.0",
        "sklearn", "pandas==0.24.2", "azureml-sdk==1.0.21", "numpy==1.16.2"
    ])

    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    # NOTE(review): gpu_compute_run_config is configured but not passed to
    # any step below (the GPU work runs through the TensorFlow estimator) -
    # confirm whether it is still needed.
    gpu_compute_run_config = RunConfiguration(conda_dependencies=gpu_cd)
    gpu_compute_run_config.environment.docker.enabled = True
    gpu_compute_run_config.environment.docker.gpu_support = True
    gpu_compute_run_config.environment.docker.base_image = DEFAULT_GPU_IMAGE
    gpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    # DataReference to where the input videos are stored.
    video_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="video_data",
        path_on_datastore=os.path.join("prednet", "data", "video", dataset))

    # Intermediate data passed between the pipeline steps.
    raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames",
                                     datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    print("DataReference object created")

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py",
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.'])
    print("video_decode created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name='prepare_data',
        script_name="data_preparation.py",
        arguments=[
            "--input_data", raw_data, "--output_data", preprocessed_data
        ],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.'])
    data_prep.run_after(video_decoding)

    print("data_prep created")

    est = TensorFlow(source_directory=script_folder,
                     compute_target=gpu_compute_target,
                     pip_packages=[
                         'keras==2.0.8', 'theano', 'tensorflow==1.8.0',
                         'tensorflow-gpu==1.8.0', 'matplotlib', 'horovod',
                         'hickle'
                     ],
                     entry_script='train.py',
                     use_gpu=True,
                     node_count=1)

    # Hyperparameter search space; list-valued options are passed to the
    # training script as comma-separated strings.
    ps = RandomParameterSampling({
        '--batch_size': choice(2, 4, 8, 16),
        '--filter_sizes': choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        '--stack_sizes': choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),
        '--learning_rate': loguniform(-6, -1),
        '--lr_decay': loguniform(-9, -1),
        '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2",
                                  "2, 3", "0", "1", "2", "3"),
        '--transfer_learning': choice("True", "False")
    })

    policy = BanditPolicy(evaluation_interval=2,
                          slack_factor=0.1,
                          delay_evaluation=20)

    hdc = HyperDriveRunConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        policy=policy,
        primary_metric_name='val_loss',
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=5,
        max_concurrent_runs=5,
        max_duration_minutes=60 * 6)

    hd_step = HyperDriveStep(
        name="train_w_hyperdrive",
        hyperdrive_run_config=hdc,
        estimator_entry_script_arguments=[
            '--data-folder', preprocessed_data, '--remote_execution'
        ],
        inputs=[preprocessed_data],
        metrics_output=data_metrics,
        allow_reuse=True)
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=gpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.'])
    registration_step.run_after(hd_step)

    pipeline = Pipeline(
        workspace=ws,
        steps=[video_decoding, data_prep, hd_step, registration_step])
    print("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete")

    pipeline_name = 'prednet_' + dataset
    pipeline.publish(name=pipeline_name)

    return pipeline_name
# use get_status() to get a detailed status for the current cluster.
# Best effort only: a failure here is reported and the script continues.
# Exception (rather than BaseException) is caught so that Ctrl-C and
# SystemExit still stop the script.
try:
    print(gpu_compute_target.get_status().serialize())
except Exception as e:
    print("Could not get status of compute target.")
    print(e)

print("PipelineData object created")

# This is where data is expected to be found in Azure Blob storage
path_on_datastore = os.path.join("knowledge_distillation", "data")

# DataReference to where the input dataset is stored
labeled_data = DataReference(datastore=def_blob_store,
                             data_reference_name="labeled_data",
                             path_on_datastore=path_on_datastore)
print("DataReference object created")

# Conda dependencies for compute targets
gpu_cd = CondaDependencies.create(conda_packages=['cudatoolkit=10.0.130'],
                                  pip_packages=[
                                      'keras', 'tensorflow', 'tensorflow-gpu',
                                      'matplotlib', 'pillow', 'six', 'numpy',
                                      'azureml-sdk', 'tqdm'
                                  ])

# Runconfig: Docker with GPU support, using the dependencies above
gpu_compute_run_config = RunConfiguration(conda_dependencies=gpu_cd)
gpu_compute_run_config.environment.docker.enabled = True
gpu_compute_run_config.environment.docker.gpu_support = True
# Re-register the "telemetry" blob datastore from the command-line
# arguments: an existing registration with that name is removed first.
# (presumably ws.datastores iterates over datastore names - confirm)
old_datastore = [ds for ds in ws.datastores if ds == "telemetry"]
if old_datastore:
    old_ds = Datastore.get(ws, "telemetry")
    old_ds.unregister()

telemetry_ds = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name='telemetry',
    container_name=args.storage_container,
    account_name=args.storage_account,
    account_key=args.storage_key,
)

# Input location on the telemetry datastore, taken from args.storage_path.
input_data = DataReference(
    datastore=telemetry_ds,
    data_reference_name="input_data",
    path_on_datastore=args.storage_path,
)

# Estimator that runs dataprep.py from the 010-preprocessing directory.
preprocessing_est = SKLearn(
    source_directory='010-preprocessing',
    compute_target=cpu_cluster,
    entry_script='dataprep.py',
    conda_packages=['pandas'],
    pip_packages=['fastavro'],
)

# Pipeline output stored on the telemetry datastore.
output = PipelineData("output", datastore=telemetry_ds)

# NOTE(review): the statement below is cut off in this excerpt.
preprocessing_step = EstimatorStep(
    name="Preprocessing_Train",
    estimator=preprocessing_est,
def main():
    """
    Builds the Azure ML pipeline for data engineering and model training.

    Reads ``DATABRICKS_WORKSPACE_NAME``, ``TRAINING_DATA_ACCOUNT_NAME``,
    ``AML_WORKSPACE_NAME``, ``SUBSCRIPTION_ID`` and ``RESOURCE_GROUP`` from
    the environment (``KeyError`` if one is missing) and the optional
    ``BUILD_BUILDID``, which defaults to ``0``.

    The function attaches Databricks as a compute target and creates the
    instance pool when either is missing, registers the training data
    datastore, then publishes a pipeline with a single Databricks
    feature-engineering step and prints its id as an Azure DevOps output
    variable.
    """
    databricks_workspace_name = os.environ['DATABRICKS_WORKSPACE_NAME']
    training_data_account_name = os.environ['TRAINING_DATA_ACCOUNT_NAME']
    build_id = os.getenv('BUILD_BUILDID', 0)

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=os.environ['AML_WORKSPACE_NAME'],
        subscription_id=os.environ['SUBSCRIPTION_ID'],
        resource_group=os.environ['RESOURCE_GROUP'],
    )
    print(aml_workspace)

    # Generate Databricks credentials, see https://aka.ms/databricks-aad
    dbricks_region = aml_workspace.location
    dbricks_api = f"https://{dbricks_region}.azuredatabricks.net/api/2.0"

    dbricks_client = databricks_client.create(dbricks_api)
    dbricks_client.auth_azuread(resource_group=aml_workspace.resource_group,
                                workspace_name=databricks_workspace_name)
    dbricks_client.ensure_available()

    # Attach Databricks as Azure ML training compute
    dbricks_compute_name = "databricks"
    dbricks_compute = get_databricks_compute(
        aml_workspace,
        dbricks_compute_name,
    )
    if dbricks_compute is None:
        # build_id is the int 0 when BUILD_BUILDID is unset, so it is
        # converted explicitly; plain concatenation would raise TypeError.
        pat_token = dbricks_client.post(
            'token/create',
            json={"comment": "Azure ML Token generated by Build "
                             + str(build_id)}
        )['token_value']
        dbricks_compute = create_databricks_compute(
            aml_workspace,
            databricks_workspace_name,
            dbricks_compute_name,
            pat_token,
        )

    print("dbricks_compute:")
    print(dbricks_compute)

    # Create Databricks instance pool
    pool_name = "azureml_training"
    instance_pool_id = get_instance_pool(dbricks_client, pool_name)
    if not instance_pool_id:
        dbricks_client.post('instance-pools/create', json={
            "instance_pool_name": pool_name,
            "node_type_id": "Standard_D3_v2",
            "idle_instance_autotermination_minutes": 10,
            "preloaded_spark_versions": [DATABRICKS_RUNTIME_VERSION],
        })
        # Look the pool up again to obtain the id of the new pool.
        instance_pool_id = get_instance_pool(dbricks_client, pool_name)

    notebook_folder = "/Shared/AzureMLDeployed"
    workspace_datastore = Datastore(aml_workspace, "workspaceblobstore")

    # Create a datastore for the training data container
    credentials, subscription = get_azure_cli_credentials()
    storage_client = StorageManagementClient(credentials, subscription)
    training_storage_keys = storage_client.storage_accounts.list_keys(
        aml_workspace.resource_group, training_data_account_name)
    training_datastore = Datastore.register_azure_blob_container(
        workspace=aml_workspace,
        datastore_name="trainingdata",
        container_name="trainingdata",
        account_name=training_data_account_name,
        account_key=training_storage_keys.keys[0].value,
    )

    # FEATURE ENGINEERING STEP (DATABRICKS)
    # Create feature engineering pipeline step

    training_data_input = DataReference(
        datastore=training_datastore,
        path_on_datastore="/",
        data_reference_name="training"
    )

    feature_eng_output = PipelineData("feature_engineered",
                                      datastore=workspace_datastore)

    notebook_path = upload_notebook(dbricks_client, notebook_folder,
                                    "code/prepare", "feature_engineering")

    training_dataprep_step = DatabricksStep(
        name="FeatureEngineering",
        inputs=[training_data_input],
        outputs=[feature_eng_output],
        spark_version=DATABRICKS_RUNTIME_VERSION,
        instance_pool_id=instance_pool_id,
        num_workers=3,
        notebook_path=notebook_path,
        run_name="FeatureEngineering",
        compute_target=dbricks_compute,
        allow_reuse=True,
    )

    # You can add Azure ML model training tasks using
    #   feature_eng_output as input.
    # ...

    # Create Azure ML Pipeline
    steps = [training_dataprep_step]

    ml_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    ml_pipeline.validate()
    published_pipeline = ml_pipeline.publish(
        name="Feature Engineering",
        description="Feature engineering pipeline",
        version=build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")

    # When running in Azure DevOps, set AMLPIPELINE_ID variable
    # for AML Pipeline task in next job
    print("Setting Azure DevOps variable")
    print(f"##vso[task.setvariable variable=AMLPIPELINE_ID;isOutput=true]"
          f"{published_pipeline.id}")
def _str_to_bool(value):
    """Convert a command-line token such as ``true`` / ``false`` to a bool.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is True
    because every non-empty string is truthy, so ``--flag False`` would
    enable the flag. The text is interpreted explicitly instead.

    :param value: Raw argument text (or an actual bool).
    :return: The parsed boolean.
    :raises argparse.ArgumentTypeError: If the text is not a recognised
        boolean spelling.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "1", "on"):
        return True
    # The empty string stays False, as it was under bool().
    if text in ("false", "f", "no", "n", "0", "off", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got %r" % (value,))


parser = argparse.ArgumentParser()
parser.add_argument("--await_completion", type=_str_to_bool, default=False)
parser.add_argument("--download_outputs", type=_str_to_bool, default=False)
args = parser.parse_args()

workspace = Workspace.from_config(auth=AzureCliAuthentication())

# Retrieve datastore/datasets
# retrieve datastore
datastore_name = 'workspaceblobstore'
datastore = Datastore.get(workspace, datastore_name)

# data reference: baseline profile, downloaded to the compute target
baseline_profile = DataReference(datastore,
                                 data_reference_name='baselineProfile',
                                 path_on_datastore='baseline_profile',
                                 mode='download',
                                 path_on_compute=None,
                                 overwrite=False)

# data reference: historic profile, downloaded to the compute target
historic_profile = DataReference(datastore,
                                 data_reference_name='historicProfile',
                                 path_on_datastore='historic_profile',
                                 mode='download',
                                 path_on_compute=None,
                                 overwrite=False)

# define data set names
input_name_train_sub = 'newsgroups_raw_subset_train'
input_name_test_sub = 'newsgroups_raw_subset_test'
"azureml-mlflow==1.5.0", "azureml-defaults==1.5.0" ] ) env.python.conda_dependencies = cd env.register(workspace=ws) print("Registered environment component-condition") # Specify the run configuration run_config = RunConfiguration() run_config.environment.docker.enabled = True run_config.environment.python.conda_dependencies = cd # Pipeline definition inputdata = DataReference( datastore=Datastore.get(ws, "trainingdata"), data_reference_name="data" ) train_model = PythonScriptStep( script_name="./train.py", name="fit-nlp-model", inputs=[inputdata.as_download(path_on_compute="./data")], runconfig=run_config, compute_target=compute_target, ) pipeline = Pipeline( workspace=ws, steps=[train_model], description="Builds Keras model for detecting component defects", )
#step1 cluster_name = "cpucluster" try: compute_target_cpu = ComputeTarget(workspace=ws, name=cluster_name) except ComputeTargetException: compute_config = AmlCompute.provisioning_configuration( vm_size='STANDARD_D3_V2', max_nodes=1, min_nodes=1) compute_target_cpu = ComputeTarget.create(ws, cluster_name, compute_config) compute_target_cpu.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=0) input_data_ref = DataReference( datastore=def_blob_store, data_reference_name="input_data_ref", path_on_datastore=f"{project_config['project_name']}/data/") processed_data_ref = PipelineData("processed_data_ref", datastore=def_blob_store) run_config = RunConfiguration() run_config.environment.docker.enabled = True run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE run_config.environment.python.user_managed_dependencies = False pip_packages = [ "azureml-sdk==1.0.17", "scikit-learn==0.21.3", "download==0.3.4", "pandas==0.25.1", "spacy==2.1.4", "numpy==1.17.2" ] run_config.environment.python.conda_dependencies = CondaDependencies.create(
# ## Pipeline definition # # # The Azure ML pipeline is composed of two steps: # # - Data pre-processing which consist of one-hot encoding categorical features, normalization of the features set, spliting of dataset into training/testing sets and finally writing out the output to storage. # # - Hyperdrive step that tune and train the deep kernel learning model using GPytorch and Pytorch estimator #%% [markdown] # ## Pipeline data input/output # # Here, we define the input and intermediary dataset that will be used by the pipeline steps. #%% input_dir = DataReference(datastore=default_store, data_reference_name="input_data", path_on_datastore="churn") processed_dir = PipelineData(name='processed_data', datastore=default_store) #%% [markdown] # ## Pipeline 1st step: Data Preprocessing # # We start by defining the run configuration with the needed dependencies by the preprocessing step. # # In the cell that follow, we compose the first step of the pipeline. # #%% cd = CondaDependencies() cd.add_conda_package('pandas')
# Configurer des ressources Machine Learning # help(Workspace) ws = Workspace(subscription_id=sub_id, resource_group=res_grp, workspace_name=workspace_n) # Configurer un magasin de données # Default datastore def_data_store = ws.get_default_datastore() # Configurer la référence de données # créer une source de données susceptible d’être référencée dans un pipeline en tant qu’entrée ou étape. # Dans un pipeline, une source de données est représentée par un objet DataReference. from azureml.data.data_reference import DataReference blob_input_data = DataReference(datastore=def_data_store, data_reference_name=data_ref, path_on_datastore=data_filepath) # Les données intermédiaires (ou la sortie d’une étape) sont représentées par un objet PipelineData. from azureml.pipeline.core import PipelineData output_data1 = PipelineData("output_data1", datastore=def_data_store, output_name=model_pklname) # Configurer la cible de calcul # créer une capacité de calcul Azure Machine Learning pour exécuter vos étapes from azureml.core.compute import ComputeTarget, AmlCompute compute_name = "computeuh" vm_size = 'STANDARD_D2_V2' if compute_name in ws.compute_targets: compute_target = ws.compute_targets[compute_name]