def register_dataset(dataset_name, dataframe):
    # Relies on `self`, `ws` and `run` being available from the enclosing scope.
    dataset_config = next(
        iter(filter(lambda x: x["name"] == dataset_name, self.output_reg_datasets)))
    datastore = dataset_config.get("datastore") or "default"
    description = dataset_config.get("description")
    tags = dataset_config.get("tags") or {}
    if datastore == "default":
        ds = ws.get_default_datastore()
    else:
        ds = Datastore.get(workspace=ws, datastore_name=datastore)
    target_path = f'experiment/{run.experiment.name}/run/{run.number}/out/{dataset_name}'
    default_output_dataset_tags = {
        # Dataset.Tabular.register_pandas_dataframe always writes a parquet
        "format": self.OUTPUT_FORMAT,
        "experiment": run.experiment.name,
        "run": run.number
    }
    output_dataset_tags = {**default_output_dataset_tags, **tags}
    Dataset.Tabular.register_pandas_dataframe(
        dataframe,
        target=(ds, target_path),
        name=dataset_name,
        description=description,
        tags=output_dataset_tags)
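# Hedged sketch of the configuration this helper expects: each entry in
# self.output_reg_datasets is assumed to be a dict with at least a "name" key,
# plus optional "datastore", "description" and "tags". The values below are
# purely illustrative and not taken from the original code.
example_output_reg_datasets = [
    {
        "name": "scored_customers",       # assumed dataset name
        "datastore": "default",           # or the name of a registered datastore
        "description": "Scored customer records",
        "tags": {"owner": "data-team"},
    },
]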
from azureml.core import Datastore, Workspace
from msrest.exceptions import HttpOperationError


def get_blob_datastore(workspace: Workspace, data_store_name: str, storage_name: str,
                       storage_key: str, container_name: str):
    """
    Returns a reference to a datastore

    Parameters:
        workspace (Workspace): existing AzureML Workspace object
        data_store_name (string): data store name
        storage_name (string): blob storage account name
        storage_key (string): blob storage account key
        container_name (string): container name

    Returns:
        Datastore: a reference to the datastore
    """
    try:
        blob_datastore = Datastore.get(workspace, data_store_name)
        print("Found Blob Datastore with name: %s" % data_store_name)
    except HttpOperationError:
        blob_datastore = Datastore.register_azure_blob_container(
            workspace=workspace,
            datastore_name=data_store_name,
            account_name=storage_name,      # Storage account name
            container_name=container_name,  # Name of Azure blob container
            account_key=storage_key)        # Storage account key
        print("Registered blob datastore with name: %s" % data_store_name)
    return blob_datastore
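# Hedged usage sketch for get_blob_datastore. The workspace config, datastore
# name, storage account, key and container below are assumptions, not values
# from the original snippet.
import os

ws = Workspace.from_config()
blob_ds = get_blob_datastore(
    workspace=ws,
    data_store_name="my_blob_datastore",    # assumed datastore name
    storage_name="mystorageaccount",        # assumed storage account
    storage_key=os.environ["STORAGE_KEY"],  # assumed environment variable
    container_name="mycontainer")           # assumed container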
def get_ds_object(ws, name):
    """
    get_ds_object - Get workspace datastore object

    :param Workspace ws: workspace
    :param str name: data store name
    :returns: the datastore registered under the given name
    :rtype: Datastore
    """
    return Datastore.get(ws, name)
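# Hedged usage sketch, assuming an existing Workspace object `ws`;
# "workspaceblobstore" is only an example datastore name.
datastore = get_ds_object(ws, "workspaceblobstore")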
from azureml.core import Dataset, Datastore


def register_dataset(path, aml_interface, storage_acct_name, storage_acct_key):
    workspace = aml_interface.workspace
    datastore = Datastore.register_azure_blob_container(
        workspace=workspace,
        datastore_name='prediction',
        container_name='prediction',
        account_name=storage_acct_name,
        account_key=storage_acct_key)
    prediction_datastore = Datastore.get(workspace, 'prediction')
    datastore_path = [(prediction_datastore, path)]
    dataset = Dataset.Tabular.from_delimited_files(path=datastore_path)
    dataset = dataset.register(workspace=aml_interface.workspace,
                               name='Prediction')
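# Hedged usage sketch. `aml_interface` is assumed to be any object exposing a
# `.workspace` attribute (as the function requires); the CSV path, storage
# account name and key are placeholders.
register_dataset(
    path='predictions/latest.csv',           # assumed path within the container
    aml_interface=aml_interface,             # assumed wrapper exposing .workspace
    storage_acct_name='mystorageaccount',    # assumed storage account
    storage_acct_key='<storage-account-key>')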
def get_or_create_dataset(azure_config: AzureConfig, azure_dataset_id: str) -> Dataset:
    """
    Looks in the AzureML datastore for a dataset of the given name. If there is no such dataset, a dataset is
    created and registered, assuming that the files are in a folder that has the same name as the dataset.
    For example, if azure_dataset_id is 'foo', then the 'foo' dataset points to the
    <container_root>/datasets/foo folder.

    WARNING: the behaviour of Dataset.File.from_files, used below, is idiosyncratic. For example,
    if "mydataset" storage has two "foo..." subdirectories each containing
    a file dataset.csv and a directory ABC,

    datastore = Datastore.get(workspace, "mydataset")
    # This dataset has the file(s) in foo-bar01 at top level, e.g. dataset.csv
    ds1 = Dataset.File.from_files([(datastore, "foo-bar01/*")])
    # This dataset has two directories at top level, each with a name matching foo-bar*, and each
    # containing dataset.csv.
    ds2 = Dataset.File.from_files([(datastore, "foo-bar*/*")])
    # This dataset contains a single directory "mydataset" at top level, containing a subdirectory
    # foo-bar01, containing dataset.csv and (part of) ABC.
    ds3 = Dataset.File.from_files([(datastore, "foo-bar01/*"),
                                   (datastore, "foo-bar01/ABC/abc_files/*/*.nii.gz")])

    These behaviours can be verified by calling "ds.download()" on each dataset ds.
    """
    if not azure_config.azureml_datastore:
        raise ValueError("No value set for 'azureml_datastore' (name of the datastore in the AzureML workspace)")
    logging.info(f"Retrieving datastore '{azure_config.azureml_datastore}' from AzureML workspace")
    workspace = azure_config.get_workspace()
    datastore = Datastore.get(workspace, azure_config.azureml_datastore)
    try:
        logging.info(f"Trying to retrieve AzureML Dataset '{azure_dataset_id}'")
        azureml_dataset = Dataset.get_by_name(workspace, name=azure_dataset_id)
        logging.info("Dataset found.")
    except Exception:
        logging.info(f"Dataset does not yet exist, creating a new one from data in folder '{azure_dataset_id}'")
        # See WARNING above before changing the from_files call!
        azureml_dataset = Dataset.File.from_files([(datastore, azure_dataset_id)])
        logging.info("Registering the dataset for future use.")
        azureml_dataset.register(workspace, name=azure_dataset_id)
    return azureml_dataset
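# Hedged usage sketch, assuming an AzureConfig instance (project-specific type)
# with azureml_datastore already set; "my_dataset" is an assumed dataset id.
azureml_dataset = get_or_create_dataset(azure_config, azure_dataset_id="my_dataset")
downloaded_files = azureml_dataset.download(target_path="./my_dataset", overwrite=True)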
from azureml.core import Datastore
from msrest.exceptions import HttpOperationError


def config(ws, blob_datastore_name, account_name, container_name, account_key):
    try:
        blob_datastore = Datastore.get(ws, blob_datastore_name)
        print("Found Blob Datastore with name: %s" % blob_datastore_name)
    except HttpOperationError:
        blob_datastore = Datastore.register_azure_blob_container(
            workspace=ws,
            datastore_name=blob_datastore_name,
            account_name=account_name,      # Storage account name
            container_name=container_name,  # Name of Azure blob container
            account_key=account_key)        # Storage account key
        print("Registered blob datastore with name: %s" % blob_datastore_name)
    return blob_datastore
from azureml.core import Datastore
from azureml.data.data_reference import DataReference


def createDataReference(workspace, storage_name, storage_key, storage_container_name,
                        data_store_name, data_reference_name):
    '''
    If not present, registers a new azureml.core.datastore.Datastore. Once the data store
    is in hand, it creates an instance of azureml.data.data_reference.DataReference that
    can be used in an Azure ML pipeline step.

    PARAMS:
        workspace              : azureml.core.Workspace : Existing AMLS Workspace
        storage_name           : string : Name of the Azure Storage Account
        storage_key            : string : Access Key to the Azure Storage Account
        storage_container_name : string : Container name to receive blobs. Must exist
        data_store_name        : string : Name of the registered data store
        data_reference_name    : string : Name of the data reference

    RETURNS:
        tuple(azureml.core.datastore.Datastore, azureml.data.data_reference.DataReference)
    '''
    data_store = None
    try:
        data_store = Datastore.get(workspace, data_store_name)
        print("Found existing data store - ", data_store_name)
    except Exception:
        print("Creating data store - ", data_store_name)
        data_store = Datastore.register_azure_blob_container(
            workspace,
            datastore_name=data_store_name,
            container_name=storage_container_name,
            account_name=storage_name,
            account_key=storage_key,
        )

    if data_store is None:
        raise Exception("Could not create/find data store.")

    return data_store, DataReference(datastore=data_store,
                                     data_reference_name=data_reference_name)
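# Hedged usage sketch. The workspace `ws`, storage account, key, container and
# names below are placeholders, and PythonScriptStep is only one example of a
# pipeline step that can consume the returned DataReference.
from azureml.pipeline.steps import PythonScriptStep

data_store, data_ref = createDataReference(
    workspace=ws,                          # assumed existing Workspace
    storage_name="mystorageaccount",       # assumed storage account
    storage_key="<storage-account-key>",   # assumed key
    storage_container_name="mycontainer",  # assumed container
    data_store_name="my_datastore",
    data_reference_name="raw_input")

step = PythonScriptStep(
    name="process",
    script_name="process.py",              # assumed script
    arguments=["--input", data_ref],
    inputs=[data_ref],
    compute_target="cpu-cluster",          # assumed compute target name
    source_directory=".")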
if 'creditcard' not in ws.datasets:
    # Set up the blob datastore
    blob_datastore_name = 'MyBlobDatastore'
    account_name = os.getenv(
        "BLOB_ACCOUNTNAME_62", "PUT YOUR STORAGE ACCOUNT NAME HERE")    # Storage account name
    container_name = os.getenv(
        "BLOB_CONTAINER_62", "PUT YOUR STORAGE CONTAINER NAME HERE")    # Name of Azure blob container
    account_key = os.getenv(
        "BLOB_ACCOUNT_KEY_62", "PUT YOUR STORAGE ACCOUNT KEY HERE")     # Storage account key

    try:
        blob_datastore = Datastore.get(ws, blob_datastore_name)
        print("Found Blob Datastore with name: %s" % blob_datastore_name)
    except Exception:
        blob_datastore = Datastore.register_azure_blob_container(
            workspace=ws,
            datastore_name=blob_datastore_name,
            account_name=account_name,      # Storage account name
            container_name=container_name,  # Name of Azure blob container
            account_key=account_key)        # Storage account key
        print("Registered blob datastore with name: %s" % blob_datastore_name)

    blob_data_ref = DataReference(datastore=blob_datastore,
                                  data_reference_name="blob_test_data",
                                  path_on_datastore="testdata")
    csv_path = (blob_datastore, '/creditcard.csv')
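    # Hedged continuation sketch (not part of the original snippet): one plausible
    # way to turn csv_path into the registered 'creditcard' dataset checked for
    # above. Assumes `from azureml.core import Dataset`.
    creditcard_ds = Dataset.Tabular.from_delimited_files(path=csv_path)
    creditcard_ds.register(workspace=ws, name='creditcard')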
import argparse
from pathlib import Path

from azureml.core.datastore import Datastore
from azureml.core.workspace import Workspace

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--subscription-id", type=str)
    parser.add_argument("--resource-group", type=str)
    parser.add_argument("--workspace-name", type=str)
    parser.add_argument("--datastore-name", type=str)
    parser.add_argument("--data-directory", type=str)
    parser.add_argument("--dataset-name", type=str)
    args = parser.parse_args()

    print(args.workspace_name)
    workspace = Workspace(
        subscription_id=args.subscription_id,
        resource_group=args.resource_group,
        workspace_name=args.workspace_name,
    )
    datastore = Datastore.get(workspace, args.datastore_name)

    local_path = Path(args.data_directory)
    for phase in ["train", "val"]:
        local_directory = str(local_path / phase)
        target_path = str(Path(args.dataset_name) / phase)
        datastore.upload(local_directory, target_path=target_path, show_progress=True)
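# Hedged example invocation (the script filename and all argument values are
# assumptions, not taken from the original):
#
#   python upload_to_datastore.py \
#       --subscription-id <subscription-id> \
#       --resource-group my-resource-group \
#       --workspace-name my-workspace \
#       --datastore-name workspaceblobstore \
#       --data-directory ./data \
#       --dataset-name my_dataset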
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('No compute cluster named {}'.format(cluster_name))
    exit()

curated_env_name = 'Resnet50v15-CPU-cluster'
# pytorch_env = Environment.get(workspace=ws, name=curated_env_name)
pytorch_env = Environment.from_conda_specification(
    name=curated_env_name, file_path='./conda_dependencies.yml')

project_folder = './'
data_path = 'datasets'
datastore = Datastore.get(ws, 'workspaceblobstore')
dataset = Dataset.File.from_files(path=(datastore, data_path))
data_loc = dataset.as_named_input('input').as_mount()

src = ScriptRunConfig(
    source_directory=project_folder,
    # command=['ls'],
    script='train_resnet.py',
    arguments=[
        '--num_epochs', 16,
        '--batch', '32',
        '--shuffle', 'True',
        '--dataloc', data_loc,
    ],
    compute_target=compute_target,
    environment=pytorch_env)
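# Hedged follow-up sketch (not from the original snippet): submit the configured
# run; 'resnet50-train' is an assumed experiment name.
from azureml.core import Experiment

run = Experiment(workspace=ws, name='resnet50-train').submit(src)
run.wait_for_completion(show_output=True)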
import os
from pathlib import Path

import pandas as pd
from azureml.core import Datastore

from utils import get_workspace

raw_data_dir = "C:\\Dataspace\\IMS\\2nd_test"
prep_data_dir = "C:\\Dataspace\\IMS\\processed\\2nd_test"
datastore_name = "bearing_datastore"
dataset_name = "bearing_dataset"
container_name = "bearingdata"

sensor_data = pd.DataFrame()
ws = get_workspace()

try:
    datastore = Datastore.get(ws, datastore_name)
    print("Datastore found: ", datastore_name)
except Exception:
    datastore = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name=datastore_name,
        account_name=os.environ.get('AML_BLOB_ACCOUNT_NAME'),
        container_name=container_name,
        account_key=os.environ.get('AML_BLOB_ACCOUNT_KEY'),
        endpoint="core.chinacloudapi.cn")
    print("Datastore registered: ", datastore_name)

for filename in os.listdir(raw_data_dir):
    data = pd.read_csv(os.path.join(raw_data_dir, filename),
                       names=["c1", "c2", "c3", "c4"], sep='\t')
dsNYCTaxi = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name=blob_datastore_name,
    account_name=account_name,    # ADLS Gen2 account name
    container_name='nyctaxi',     # ADLS Gen2 filesystem
    account_key=account_key)

# COMMAND ----------

print(dsNYCTaxi.datastore_type)

# COMMAND ----------

from azureml.pipeline.core import PipelineParameter, Pipeline, PipelineData
from azureml.data.data_reference import DataReference

# Use the default blob storage
dsNYCTaxi = Datastore.get(ws, "dsblob")
print('Datastore {} will be used'.format(dsNYCTaxi.name))

# pipeline_param = PipelineParameter(name="my_pipeline_param", default_value="pipeline_param1")

datasetFilePath = DataReference(datastore=dsNYCTaxi,
                                path_on_datastore="/merged_aml_dbr",
                                data_reference_name="datasetFilePath")

output = PipelineData("output", datastore=dsNYCTaxi)

# COMMAND ----------

from azureml.pipeline.steps import DatabricksStep
from azureml.core.databricks import PyPiLibrary