def _get_data_references(self, request_id, internal_datastore):
    print('AMLCompute, _get_data_references() called. Request ID: {}'.format(request_id))

    # A datastore name may only contain alphanumeric characters and _.
    request_id_to_use_for_datastore = request_id.replace('-', '_')

    try:
        # Setting the overwrite flag to True overwrites any datastore previously created with that name.
        # internal_datastore stores all user-facing files: the list of images, detection results and
        # the list of failed images; each job also needs the list of images as an input.
        internal_datastore_name = 'internal_datastore_{}'.format(request_id_to_use_for_datastore)
        internal_account_name = internal_datastore['account_name']
        internal_account_key = internal_datastore['account_key']
        internal_container_name = internal_datastore['container_name']
        internal_datastore = Datastore.register_azure_blob_container(
            self.ws,
            internal_datastore_name,
            internal_container_name,
            internal_account_name,
            account_key=internal_account_key)
        print('internal_datastore done')

        # output_datastore stores the output of score.py in each job, which is another container
        # in the same storage account as internal_datastore.
        output_datastore_name = 'output_datastore_{}'.format(request_id_to_use_for_datastore)
        output_container_name = api_config.AML_CONTAINER
        output_datastore = Datastore.register_azure_blob_container(
            self.ws,
            output_datastore_name,
            output_container_name,
            internal_account_name,
            account_key=internal_account_key)
        print('output_datastore done')
    except Exception as e:
        raise RuntimeError('Error in connecting to the datastores for AML Compute: {}'.format(str(e)))

    try:
        internal_dir = DataReference(datastore=internal_datastore,
                                     data_reference_name='internal_dir',
                                     mode='mount')
        output_dir = PipelineData('output_{}'.format(request_id_to_use_for_datastore),
                                  datastore=output_datastore,
                                  output_mode='mount')
        print('Finished setting up the Data References.')
    except Exception as e:
        raise RuntimeError('Error in creating data references for AML Compute: {}.'.format(str(e)))

    return internal_dir, output_dir
def get_blob_datastore(workspace: Workspace, data_store_name: str, storage_name: str,
                       storage_key: str, container_name: str):
    """
    Returns a reference to a datastore

    Parameters:
      workspace (Workspace): existing AzureML Workspace object
      data_store_name (string): datastore name
      storage_name (string): blob storage account name
      storage_key (string): blob storage account key
      container_name (string): container name

    Returns:
      Datastore: a reference to the datastore
    """
    try:
        blob_datastore = Datastore.get(workspace, data_store_name)
        print("Found Blob Datastore with name: %s" % data_store_name)
    except HttpOperationError:
        blob_datastore = Datastore.register_azure_blob_container(
            workspace=workspace,
            datastore_name=data_store_name,
            account_name=storage_name,      # Storage account name
            container_name=container_name,  # Name of Azure blob container
            account_key=storage_key)        # Storage account key
        print("Registered blob datastore with name: %s" % data_store_name)
    return blob_datastore
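# A minimal usage sketch for the helper above; the workspace config file and all
# account/container values below are hypothetical placeholders, not from the original.
from azureml.core import Workspace

ws = Workspace.from_config()  # assumes a local config.json

datastore = get_blob_datastore(
    workspace=ws,
    data_store_name="my_blob_datastore",  # hypothetical
    storage_name="mystorageaccount",      # hypothetical
    storage_key="<storage-account-key>",
    container_name="mycontainer")
print(datastore.datastore_type)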
def register_data_store(work_space, data_store_name, container_name,
                        blob_account_name, blob_account_key, set_default=False):
    """
    register_data_store - register a blob container as a datastore

    :param Workspace work_space: AML workspace
    :param str data_store_name: datastore name
    :param str container_name: blob container name
    :param str blob_account_name: blob storage account name
    :param str blob_account_key: blob storage account key
    :param bool set_default: make this the workspace's default datastore
    :returns: data_store
    :rtype: Datastore
    """
    data_store = Datastore.register_azure_blob_container(
        workspace=work_space,
        datastore_name=data_store_name,
        container_name=container_name,
        account_name=blob_account_name,
        account_key=blob_account_key,
        create_if_not_exists=True)

    # Set it as the default datastore for the AML workspace
    if set_default:
        work_space.set_default_datastore(data_store_name)
    return data_store
def mount_datastores(self, datastore_name, container_name, data_ref_path, data_ref_name=None):
    res_mngr = ResourceManager(self.args.spn_id, self.args.spn_secret, self.args.tenant_id)
    self.account_key = res_mngr.get_storage_account_key(
        self.args.account_name, self.args.subscription_id, self.args.resource_group_name)

    ds = Datastore.register_azure_blob_container(
        self.ws,
        datastore_name,
        container_name,
        self.args.account_name,
        account_key=self.account_key,
        create_if_not_exists=True)

    base_mount = ds.path(path=data_ref_path, data_reference_name=data_ref_name).as_mount()
    return base_mount
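# Sketch of consuming the mount returned by mount_datastores in a classic-SDK
# Estimator; the instance name, entry script and compute target are hypothetical.
from azureml.train.estimator import Estimator

base_mount = helper.mount_datastores('my_ds', 'my-container', 'data/train')  # 'helper' is a hypothetical instance
est = Estimator(source_directory='.',
                entry_script='train.py',       # hypothetical entry script
                compute_target='gpu-cluster',  # hypothetical compute target
                script_params={'--data_dir': base_mount})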
def register_dataset(path, aml_interface, storage_acct_name, storage_acct_key):
    workspace = aml_interface.workspace
    datastore = Datastore.register_azure_blob_container(
        workspace=workspace,
        datastore_name='prediction',
        container_name='prediction',
        account_name=storage_acct_name,
        account_key=storage_acct_key)
    prediction_datastore = Datastore.get(workspace, 'prediction')
    datastore_path = [(prediction_datastore, path)]
    dataset = Dataset.Tabular.from_delimited_files(path=datastore_path)
    dataset = dataset.register(workspace=aml_interface.workspace, name='Prediction')
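# Once registered, the tabular dataset can be fetched by name from any session
# attached to the workspace; a minimal sketch.
from azureml.core import Workspace, Dataset

ws = Workspace.from_config()
prediction_ds = Dataset.get_by_name(ws, name='Prediction')
df = prediction_ds.to_pandas_dataframe()
print(df.head())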
def config(ws, blob_datastore_name, account_name, container_name, account_key):
    try:
        blob_datastore = Datastore.get(ws, blob_datastore_name)
        print("Found Blob Datastore with name: %s" % blob_datastore_name)
    except HttpOperationError:
        blob_datastore = Datastore.register_azure_blob_container(
            workspace=ws,
            datastore_name=blob_datastore_name,
            account_name=account_name,      # Storage account name
            container_name=container_name,  # Name of Azure blob container
            account_key=account_key)        # Storage account key
        print("Registered blob datastore with name: %s" % blob_datastore_name)
    return blob_datastore
def createDataReference(workspace, storage_name, storage_key, storage_container_name,
                        data_store_name, data_reference_name):
    '''
    If not present, registers a new azureml.core.datastore.Datastore. Once the data
    store is in hand, it creates an instance of azureml.data.data_reference.DataReference
    that can be used in an Azure ML pipeline step.

    PARAMS:
        workspace              : azureml.core.Workspace : Existing AMLS Workspace
        storage_name           : string : Name of the Azure Storage Account
        storage_key            : string : Access Key to the Azure Storage Account
        storage_container_name : string : Container name to receive blobs. Must exist
        data_store_name        : string : Name of the registered data store
        data_reference_name    : string : Name of the data reference

    RETURNS:
        tuple(azureml.core.datastore.Datastore, azureml.data.data_reference.DataReference)
    '''
    data_store = None
    try:
        data_store = Datastore.get(workspace, data_store_name)
        print("Found existing data store - ", data_store_name)
    except Exception:
        print("Creating data store - ", data_store_name)
        data_store = Datastore.register_azure_blob_container(
            workspace,
            datastore_name=data_store_name,
            container_name=storage_container_name,
            account_name=storage_name,
            account_key=storage_key,
        )

    if data_store is None:
        raise Exception("Could not create/find data store.")

    return data_store, DataReference(datastore=data_store,
                                     data_reference_name=data_reference_name)
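# Sketch of wiring the returned DataReference into a pipeline step; the storage
# values, step name, script and compute target are hypothetical placeholders.
from azureml.pipeline.steps import PythonScriptStep

data_store, data_ref = createDataReference(
    ws, 'mystorageaccount', '<storage-key>', 'mycontainer',
    'my_data_store', 'input_data')

step = PythonScriptStep(name="process_data",
                        script_name="process.py",
                        source_directory=".",
                        arguments=["--input", data_ref],
                        inputs=[data_ref],
                        compute_target="cpu-cluster")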
def create_and_attach_blob_storage(cfg, ws):
    """ If required, creates the blob storage containers in the datareferences of cfg """
    if len(cfg.DataReference.localDirectoryBlobList) > 0:
        for ref in cfg.DataReference.localDirectoryBlobList:
            log.info("Attempting to create Blob Container '%s' on storage account '%s'.",
                     ref.remoteBlobContainer, ref.storageAccountName)
            blob_service = BlockBlobService(ref.storageAccountName, ref.storageAccountKey)
            created = blob_service.create_container(ref.remoteBlobContainer, fail_on_exist=False)
            if created:
                log.info("Blob Container '%s' on storage account '%s' created.",
                         ref.remoteBlobContainer, ref.storageAccountName)
            else:
                log.info("Blob Container '%s' on storage account '%s' already existed.",
                         ref.remoteBlobContainer, ref.storageAccountName)

            # Get the most recent list of datastores linked to the current workspace
            datastores = ws.datastores

            # Check whether the datastore is already registered
            ds = None if ref.dataref_id not in datastores else Datastore(workspace=ws, name=ref.dataref_id)

            # If the datastore exists but isn't mapped to the right place, re-register it
            if ds:
                if ds.account_name == ref.storageAccountName and ds.container_name == ref.remoteBlobContainer:
                    recreate = False
                else:
                    recreate = True
                    # also remove the existing reference
                    ds.unregister()
            else:
                recreate = True

            if recreate:
                log.info('Registering blob "{}" to AML datastore for AML workspace "{}" under datastore id "{}".'.format(
                    ref.remoteBlobContainer, ws.name, ref.dataref_id))
                ds = Datastore.register_azure_blob_container(
                    workspace=ws,
                    datastore_name=ref.dataref_id,
                    container_name=ref.remoteBlobContainer,
                    account_name=ref.storageAccountName,
                    account_key=ref.storageAccountKey,
                    # Overwrites the datastore object (not the data itself) if it is already part of this workspace
                    overwrite=True,
                )
            else:
                log.info('Blob "{}" under AML workspace "{}" already registered under datastore id "{}".'.format(
                    ref.remoteBlobContainer, ws.name, ref.dataref_id))
def main():
    """
    Builds the Azure ML pipeline for data engineering and model training.
    """
    databricks_workspace_name = os.environ['DATABRICKS_WORKSPACE_NAME']
    training_data_account_name = os.environ['TRAINING_DATA_ACCOUNT_NAME']
    build_id = os.getenv('BUILD_BUILDID', '0')

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=os.environ['AML_WORKSPACE_NAME'],
        subscription_id=os.environ['SUBSCRIPTION_ID'],
        resource_group=os.environ['RESOURCE_GROUP'],
    )
    print(aml_workspace)

    # Generate Databricks credentials, see https://aka.ms/databricks-aad
    dbricks_region = aml_workspace.location
    dbricks_api = f"https://{dbricks_region}.azuredatabricks.net/api/2.0"
    dbricks_client = databricks_client.create(dbricks_api)
    dbricks_client.auth_azuread(resource_group=aml_workspace.resource_group,
                                workspace_name=databricks_workspace_name)
    dbricks_client.ensure_available()

    # Attach Databricks as Azure ML training compute
    dbricks_compute_name = "databricks"
    dbricks_compute = get_databricks_compute(
        aml_workspace,
        dbricks_compute_name,
    )
    if dbricks_compute is None:
        pat_token = dbricks_client.post(
            'token/create',
            json={"comment": "Azure ML Token generated by Build " + build_id})['token_value']
        dbricks_compute = create_databricks_compute(
            aml_workspace,
            databricks_workspace_name,
            dbricks_compute_name,
            pat_token,
        )
    print("dbricks_compute:")
    print(dbricks_compute)

    # Create Databricks instance pool
    pool_name = "azureml_training"
    instance_pool_id = get_instance_pool(dbricks_client, pool_name)
    if not instance_pool_id:
        dbricks_client.post('instance-pools/create', json={
            "instance_pool_name": pool_name,
            "node_type_id": "Standard_D3_v2",
            "idle_instance_autotermination_minutes": 10,
            "preloaded_spark_versions": [DATABRICKS_RUNTIME_VERSION],
        })
        instance_pool_id = get_instance_pool(dbricks_client, pool_name)

    notebook_folder = "/Shared/AzureMLDeployed"
    workspace_datastore = Datastore(aml_workspace, "workspaceblobstore")

    # Create a datastore for the training data container
    credentials, subscription = get_azure_cli_credentials()
    storage_client = StorageManagementClient(credentials, subscription)
    training_storage_keys = storage_client.storage_accounts.list_keys(
        aml_workspace.resource_group, training_data_account_name)
    training_datastore = Datastore.register_azure_blob_container(
        workspace=aml_workspace,
        datastore_name="trainingdata",
        container_name="trainingdata",
        account_name=training_data_account_name,
        account_key=training_storage_keys.keys[0].value,
    )

    # FEATURE ENGINEERING STEP (DATABRICKS)
    # Create feature engineering pipeline step
    training_data_input = DataReference(datastore=training_datastore,
                                        path_on_datastore="/",
                                        data_reference_name="training")
    feature_eng_output = PipelineData("feature_engineered",
                                      datastore=workspace_datastore)
    notebook_path = upload_notebook(dbricks_client, notebook_folder,
                                    "code/prepare", "feature_engineering")
    training_dataprep_step = DatabricksStep(
        name="FeatureEngineering",
        inputs=[training_data_input],
        outputs=[feature_eng_output],
        spark_version=DATABRICKS_RUNTIME_VERSION,
        instance_pool_id=instance_pool_id,
        num_workers=3,
        notebook_path=notebook_path,
        run_name="FeatureEngineering",
        compute_target=dbricks_compute,
        allow_reuse=True,
    )

    # You can add Azure ML model training tasks using
    # feature_eng_output as input.
    # ...
    # Create Azure ML Pipeline
    steps = [training_dataprep_step]
    ml_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    ml_pipeline.validate()
    published_pipeline = ml_pipeline.publish(
        name="Feature Engineering",
        description="Feature engineering pipeline",
        version=build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")

    # When running in Azure DevOps, set the AMLPIPELINE_ID variable
    # for the AML Pipeline task in the next job
    print("Setting Azure DevOps variable")
    print(f"##vso[task.setvariable variable=AMLPIPELINE_ID;isOutput=true]"
          f"{published_pipeline.id}")
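# The published pipeline can later be triggered by ID, e.g. from the Azure DevOps
# job that reads AMLPIPELINE_ID; the pipeline ID and experiment name are placeholders.
from azureml.core import Workspace
from azureml.pipeline.core import PublishedPipeline

ws = Workspace.from_config()
pipeline = PublishedPipeline.get(ws, id="<pipeline-id>")
run = pipeline.submit(ws, experiment_name="feature-engineering")
run.wait_for_completion()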
"BLOB_ACCOUNTNAME_62", "PUT YOUR STORAGE ACCOUNT NAME HERE") # Storage account name container_name = os.getenv( "BLOB_CONTAINER_62", "PUT YOUR STORAGE CONTAINER NAME HERE") # Name of Azure blob container account_key = os.getenv( "BLOB_ACCOUNT_KEY_62", "PUT YOUR STORAGE ACCOUNT KEY HERE") # Storage account key try: blob_datastore = Datastore.get(ws, blob_datastore_name) print("Found Blob Datastore with name: %s" % blob_datastore_name) except: blob_datastore = Datastore.register_azure_blob_container( workspace=ws, datastore_name=blob_datastore_name, account_name=account_name, # Storage account name container_name=container_name, # Name of Azure blob container account_key=account_key) # Storage account key print("Registered blob datastore with name: %s" % blob_datastore_name) blob_data_ref = DataReference(datastore=blob_datastore, data_reference_name="blob_test_data", path_on_datastore="testdata") csv_path = (blob_datastore, '/creditcard.csv') try: tab_ds = Dataset.Tabular.from_delimited_files(path=csv_path) tab_ds = tab_ds.register(workspace=ws, name='creditcard') except Exception as ex: print(ex) else:
from azureml.core import Workspace

ws = Workspace.from_config()

from azureml.core.datastore import Datastore

batchscore_blob = Datastore.register_azure_blob_container(
    ws,
    datastore_name="images_datastore",
    container_name="sampledata",
    account_name="pipelinedata",
    overwrite=True)

def_data_store = ws.get_default_datastore()

from azureml.core.dataset import Dataset
from azureml.pipeline.core import PipelineData

input_images = Dataset.File.from_files((batchscore_blob, "batchscoring/images/"))
label_ds = Dataset.File.from_files((batchscore_blob, "batchscoring/labels/"))
output_dir = PipelineData(name="scores",
                          datastore=def_data_store,
                          output_path_on_compute="batchscoring/results")

input_images = input_images.register(workspace=ws, name="input_images")
label_ds = label_ds.register(workspace=ws, name="label_ds")

from azureml.core.model import Model

model = Model(ws, 'tf-dnn-mnist')
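# The registered file datasets are typically consumed as named inputs by a later
# pipeline step; a minimal sketch using the names registered above.
from azureml.core import Workspace, Dataset

ws = Workspace.from_config()
images = Dataset.get_by_name(ws, "input_images")
# Mount the dataset on the compute target under the input name 'images'
images_input = images.as_named_input("images").as_mount()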
# The guard around this block was elided in the original excerpt; a flag like this
# is assumed to decide between creating a new workspace and reusing an existing one.
if create_new_workspace:
    ws = Workspace.create(subscription_id=azureSubscriptionID,
                          resource_group=azureResourceGroup,
                          name=azureMLWorkSpaceName,
                          location=azureMLWorkSpaceLocation)
else:
    ws = Workspace.get(azureMLWorkSpaceName, subscription_id=azureSubscriptionID)

# create or use an existing experiment
exp = Experiment(workspace=ws, name=experiment_name)

# register our existing Azure Blob Container with the labeled audio files
ds = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name=azureStorageTargetContainer,
    container_name=azureStorageTargetContainer,
    account_name=azureStorgeAccountName,
    account_key=azureStorageKeyName,
    create_if_not_exists=False)

# create a reference where we mount the DataStore to the container instance
dr = DataReferenceConfiguration(datastore_name=ds.name,
                                path_on_compute='data',
                                mode='mount')

# upload any needed files
ws.get_default_datastore().upload(src_dir='.', target_path='.',
                                  overwrite=True, show_progress=True)
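# Sketch of attaching the DataReferenceConfiguration above to a run; ds, dr and exp
# come from the snippet, the entry script is a hypothetical placeholder.
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import RunConfiguration

run_config = RunConfiguration()
run_config.data_references = {ds.name: dr}  # mount the datastore on the compute target

src = ScriptRunConfig(source_directory='.',
                      script='train.py',  # hypothetical entry script
                      run_config=run_config)
run = exp.submit(src)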
datastore_name = "bearing_datastore" dataset_name = "bearing_dataset" container_name = "bearingdata" sensor_data = pd.DataFrame() ws = get_workspace() try: datastore = Datastore.get(ws, datastore_name) print("Datastore found: ", datastore_name) except Exception: datastore = Datastore.register_azure_blob_container( workspace=ws, datastore_name=datastore_name, account_name=os.environ.get('AML_BLOB_ACCOUNT_NAME'), container_name=container_name, account_key=os.environ.get('AML_BLOB_ACCOUNT_KEY'), endpoint="core.chinacloudapi.cn") print("Datastore registered: ", datastore_name) for filename in os.listdir(raw_data_dir): data = pd.read_csv(os.path.join(raw_data_dir, filename), names=["c1", "c2", "c3", "c4"], sep='\t') data_mean = np.array(data.abs().mean()) data_mean = pd.DataFrame(data_mean.reshape(1, 4)) data_mean.index = [pd.to_datetime(filename, format='%Y.%m.%d.%H.%M.%S')] sensor_data = sensor_data.append(data_mean) print('datapoints appended: ', filename)
# COMMAND ----------

blob_datastore_name = 'dsblob'
subscription_id = "7e48a1e8-8d3e-4e00-8bc0-098c43f5ace7"
resource_group = "rgSampleData"  # ADL_RESOURCE_GROUP
account_name = "sasampledata"
tenant_id = "72f988bf-86f1-41af-91ab-2d7cd011db47"
client_id = "2a81532b-016b-4c0e-aa43-bd9b97fbdaba"
client_secret = "<service-principal-secret>"  # do not hard-code secrets in notebooks
account_key = "<storage-account-key>"

adlsgen2_datastore = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name=blob_datastore_name,
    account_name=account_name,  # ADLS Gen2 account name
    container_name='nyctaxi',   # ADLS Gen2 filesystem
    account_key=account_key)

# COMMAND ----------

from azureml.pipeline.core import PipelineParameter, Pipeline, PipelineData
from azureml.data.data_reference import DataReference

# Use the blob datastore registered above
dsNYCTaxi = Datastore.get(ws, "dsblob")
print('Datastore {} will be used'.format(dsNYCTaxi.name))

# COMMAND ----------

print(dsNYCTaxi.datastore_type)