def upload_dataset(self, dataset_name: str, local_folder: str, datastore_name: str = None,
                   overwrite: bool = False, tags: dict = None) -> FileDataset:
    '''
    Uploads data from a local directory into an AzureML Datastore that points to Azure Data Lake
    Args:
        dataset_name (str): The name of the dataset to register
        local_folder (str): The location of the local directory to take files from
        datastore_name (str): The name of a Datastore that will contain the dataset
    Returns:
        FileDataset: The registered dataset, containing the files
    '''
    if not datastore_name:
        # No datastore name is given, so we'll take the default one
        datastore_name = self.__datastore_path

    # Connecting data store
    datastore = Datastore(self.__workspace, name=datastore_name)
    # TODO : check type of datastore
    datastore.upload(local_folder, dataset_name, overwrite, True)

    datastore_paths = [(datastore, dataset_name)]
    file_ds = Dataset.File.from_files(path=datastore_paths)
    file_ds = file_ds.register(workspace=self.__workspace,
                               name=dataset_name,
                               description=dataset_name,
                               tags=tags,
                               create_new_version=True)
    return file_ds
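# Hedged usage sketch for the method above; "helper" stands for an instance of the
# (unnamed) class that defines upload_dataset, and the dataset/folder names and tags
# below are illustrative assumptions only.
registered_ds = helper.upload_dataset("sales_raw", "./data/sales",
                                      overwrite=True, tags={"source": "local-export"})
print(registered_ds.name, registered_ds.version)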
def __exit__(self, *exc_details): """Upload files for datastore. :param exc_details: :return: """ from azureml.core.datastore import Datastore from azureml.data._dataprep_helper import dataprep module_logger.debug("Enter __exit__ function of datastore cmgr") for key, value in self._config.items(): df_config, force_read = self._to_data_reference_config(value) if self._is_upload(df_config): self._validate_config(df_config, key) ds = Datastore(workspace=self._workspace, name=df_config.data_store_name) if os.path.isdir(df_config.path_on_compute): if self._is_datastore_adlsgen1(ds): module_logger.debug( "AzureDataLake Gen1 used as Datastore for upload dir." ) dataprep().api.engineapi.api.get_engine_api( ).upload_directory( dataprep().api.engineapi.typedefinitions. UploadDirectoryMessageArguments( base_path=df_config.path_on_compute, folder_path=df_config.path_on_compute, destination=dataprep( ).api._datastore_helper._to_stream_info_value( ds, df_config.path_on_data_store), force_read=force_read, overwrite=df_config.overwrite, concurrent_task_count=1)) else: ds.upload(src_dir=df_config.path_on_compute, target_path=df_config.path_on_data_store, overwrite=df_config.overwrite) elif os.path.isfile(df_config.path_on_compute): if self._is_datastore_adlsgen1(ds): module_logger.debug( "AzureDataLake Gen1 used as Datastore for upload file." ) dataprep().api.engineapi.api.get_engine_api( ).upload_file( dataprep().api.engineapi.typedefinitions. UploadFileMessageArguments( base_path=os.path.dirname( df_config.path_on_compute), local_path=df_config.path_on_compute, destination=dataprep( ).api._datastore_helper._to_stream_info_value( ds, df_config.path_on_data_store), force_read=force_read, overwrite=df_config.overwrite)) else: ds.upload_files( files=[df_config.path_on_compute], target_path=df_config.path_on_data_store, overwrite=df_config.overwrite) module_logger.debug("Exit __exit__ function of datastore cmgr")
def get_by_data_reference(cls, workspace, path):
    data_store = Datastore(workspace, cls.DEFAULT_GLOBAL_DATASET_STORE)
    return DataReference(
        datastore=data_store,
        data_reference_name=cls.DEFAULT_DATA_REFERENCE_NAME,
        path_on_datastore=path,
    )
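# Hedged sketch of consuming such a DataReference as a pipeline step input.
# "GlobalDatasets" stands for the class that defines get_by_data_reference, and the
# step name, script, source directory, and compute target are assumptions.
global_data = GlobalDatasets.get_by_data_reference(ws, "GenericCSV/Automobile_price_data_(Raw)")
step = PythonScriptStep(name="consume_global_data",
                        script_name="process.py",
                        arguments=["--data", global_data],
                        inputs=[global_data],
                        compute_target=compute_target,
                        source_directory="./scripts")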
def load_tabular_partition(self, partition_name: str, datastore_name: str = None, columns: np.array = None,
                           first_row_header: bool = False, cloud_storage: bool = True) -> pd.DataFrame:
    '''
    Loads a partition from a tabular dataset.
        The implementation will connect to the DataStore and get all delimited files matching the partition_name.
        When configured locally, the implementation will append all files in the datastore_path with name {partition_name}.csv
    Args:
        partition_name (str): The name of the partition as a wildcard filter. Example: B* will take all files starting with B, ending with csv
        columns (np.array): The column names to assign to the dataframe
        datastore_name (str): The name of a DataStore that contains Datasets
        cloud_storage (bool): When changed to False, the dataset will be loaded from the local folder
    Returns:
        pd.DataFrame: The dataset, loaded as a DataFrame
    '''
    if not datastore_name:
        # No datastore name is given, so we'll take the default one
        datastore_name = self.__datastore_path

    if cloud_storage:
        # Connecting data store
        datastore = Datastore(self.__workspace, name=datastore_name)
        try:
            _header = PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS if first_row_header else False
            _aml_dataset = Dataset.Tabular.from_delimited_files(
                header=_header,
                path=DataPath(datastore, '/' + partition_name + '.csv'))  # , set_column_types=columns
            _df = _aml_dataset.to_pandas_dataframe()
        except DatasetValidationError as dsvalex:
            if 'provided path is not valid' in str(dsvalex):
                return None
            else:
                raise
    else:
        # Reading data from sub files in a folder
        _folder_path = datastore_name
        _partition_files = glob.glob(_folder_path + '/' + partition_name + '.csv')
        _record_found = False
        _df = None
        for filename in _partition_files:
            _header = 0 if first_row_header else None
            df = pd.read_csv(filename, index_col=None, header=_header)
            if not _record_found:
                _df = df
                _record_found = True
            else:
                _df = _df.append(df)

        if not _record_found:
            return None

    if columns is not None:
        _df.columns = columns
    return _df
def _get_datastore_and_path(self, config):
    from azureml.core import Datastore

    output_location = config["OutputLocation"]
    data_path = output_location["DataPath"]
    datastore = Datastore(self._workspace, data_path["DatastoreName"])
    return datastore, data_path["RelativePath"]
def __enter__(self): """Download files for datastore. :return: """ module_logger.debug("Enter __enter__ function of datastore cmgr") from azureml.core import Datastore, Dataset for key, value in self._config.items(): df_config, _ = self._to_data_reference_config(value) if self._is_upload(df_config): if df_config.path_on_compute: dir_to_create = os.path.normpath( os.path.dirname(df_config.path_on_compute)) if dir_to_create: _safe_mkdirs(dir_to_create) else: target_path = df_config.data_store_name if df_config.path_on_compute: target_path = os.path.join(df_config.data_store_name, df_config.path_on_compute) # The target_path is always set using the data store name with no way # for the user to overwrite this behavior. The user might attempt to use ../ in # the path on compute as a solution but this throws an exception # because the path is not normalized. # Normalizing the path to allow the user to use up-level references. target_path = os.path.normpath(target_path) if self._is_download(df_config): self._validate_config(df_config, key) ds = Datastore(workspace=self._workspace, name=df_config.data_store_name) if self._is_datastore_adlsgen1(ds): _log_and_print( "AzureDataLake Gen1 used as Datastore for download" ) if df_config.path_on_data_store is None: df_config.path_on_data_store = "" Dataset.File.from_files( (ds, df_config.path_on_data_store)).download( os.path.join(target_path, df_config.path_on_data_store), overwrite=df_config.overwrite) else: count = ds.download( target_path=target_path, prefix=df_config.path_on_data_store, overwrite=df_config.overwrite) if count == 0: import warnings warnings.warn( "Downloaded 0 files from datastore {} with path {}." .format(ds.name, df_config.path_on_data_store)) else: _safe_mkdirs(target_path) module_logger.debug("Exit __enter__ function of datastore cmgr")
def submit_pipeline( workspace=None, # Auto populated args + object pipeline_id=None, experiment_name=None, pipeline_yaml=None, pipeline_params=None, datapath_params=None, output_file=None, # We enforce a logger logger=None): """ Submit a pipeline run based on a published pipeline ID """ if pipeline_id is None and pipeline_yaml is None: raise UserErrorException("Please specify a pipeline ID or a pipeline YAML file") published_pipeline = None pipeline = None if pipeline_id is not None: from azureml.pipeline.core import PublishedPipeline published_pipeline = PublishedPipeline.get(workspace, pipeline_id) if experiment_name is None or experiment_name == '': # Use the pipeline name as the experiment name experiment_name = published_pipeline._sanitize_name() else: from azureml.pipeline.core import Pipeline pipeline = Pipeline.load_yaml(workspace, pipeline_yaml) if experiment_name is None: raise UserErrorException("Please specify an experiment name") assigned_params = _parse_key_values(pipeline_params, 'Parameter assignment') datapaths = _parse_key_values(datapath_params, 'Datapath assignment') for datapath_param_name in datapaths: datastore_with_path = datapaths[datapath_param_name] if '/' not in datastore_with_path: raise UserErrorException("Datapath value %s should have format datastore/path" % datastore_with_path) path_tokens = datastore_with_path.split('/', 1) from azureml.core import Datastore from azureml.data.datapath import DataPath datastore = Datastore(workspace, path_tokens[0]) assigned_params[datapath_param_name] = DataPath(datastore=datastore, path_on_datastore=path_tokens[1]) dict_output = _pipeline_run_submit(experiment_name, assigned_params, published_pipeline, pipeline, workspace, output_file, logger) return dict_output
def main(args):
    # Load workspace
    print("Loading Workspace")
    workspace = Workspace.from_config()
    print(
        f"Workspace name: {workspace.name}",
        f"Azure region: {workspace.location}",
        f"Subscription id: {workspace.subscription_id}",
        f"Resource group: {workspace.resource_group}",
        sep="\n"
    )

    # Print all datastores
    print("Printing all datastores")
    for name, datastore in workspace.datastores.items():
        print(name, datastore.datastore_type, sep="\t")

    # Load datastore
    print("Loading datastore")
    datastore = Datastore(
        workspace=workspace,
        name=args.datastore_name
    )

    # Upload dataset
    print("Uploading dataset")
    datastore.upload_files(
        files=["./train_dataset/iris.csv"],
        target_path="train_dataset/iris.csv",
        overwrite=True,
        show_progress=True
    )

    # Register dataset
    file_dataset = Dataset.File.from_files(
        path=[(datastore, "train_dataset/iris.csv")]
    )
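    # Hedged continuation: the original snippet stops before registering the file
    # dataset; the dataset name and description below are illustrative assumptions.
    file_dataset = file_dataset.register(
        workspace=workspace,
        name="iris_train",
        description="Iris training data uploaded from ./train_dataset",
        create_new_version=True,
    )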
def register_dataset(path, system, platform, environment, start_date, end_date,
                     secret_scope, datastore_name="dataprep"):
    # TODO: move parameters to Azure Key Vault
    sp_auth = ServicePrincipalAuthentication(
        tenant_id=dbutils.secrets.get(scope=secret_scope, key="tenant_id"),
        service_principal_id=dbutils.secrets.get(scope=secret_scope, key="service_principal_id"),
        service_principal_password=dbutils.secrets.get(
            scope=secret_scope, key="service_principal_password"))

    ws = Workspace(subscription_id=parse_arg("--AZUREML_ARM_SUBSCRIPTION"),
                   resource_group=parse_arg("--AZUREML_ARM_RESOURCEGROUP"),
                   workspace_name=parse_arg("--AZUREML_ARM_WORKSPACE_NAME"),
                   auth=sp_auth)

    datastore = Datastore(workspace=ws, name=datastore_name)
    file_dataset = Dataset.File.from_files(
        path=[(datastore, f"{path}/tag_name=*/*.parquet")])

    system_name_clean = system.replace(" ", "")
    file_dataset = file_dataset.register(
        workspace=ws,
        name=f"{system_name_clean}",
        description=f"{system_name_clean} dataset",
        tags={
            "system": system,
            "platform": platform,
            "environment": environment,
            "start_date": start_date,
            "end_date": end_date
        },
        create_new_version=True)
    return file_dataset
    leftcolumns='Survived;Pclass;Name',
    rightcolumns='Sex;Age;SibSp;Parch;Ticket;Fare;Cabin;Embarked',
    leftkeys='PassengerId',
    rightkeys='PassengerId',
    jointype='HashInner'
).set_inputs(
    left_input=input1,
    right_input=input2
)

# Configure inputs
ejoin.inputs.leftinput.configure(mode='mount')
print(ejoin.inputs.leftinput.mode)

# Configure outputs
ejoin.outputs.ejoin_output.configure(output_mode='mount',
                                     datastore=Datastore(ws, name="myownblob"))
print(ejoin.outputs.ejoin_output.output_mode)
print(ejoin.outputs.ejoin_output.datastore.name)

eselect = eselect_module_func(
    columns='Survived;Name;Sex;Age',
    input=ejoin.outputs.ejoin_output
)

# pipeline
pipeline = Pipeline(nodes=[ejoin, eselect],
                    outputs=eselect.outputs,
                    default_compute_target='aml-compute')

# In[ ]:
def create_pipeline(workspace):
    # Retrieve compute cluster
    compute_target = workspace.compute_targets[args.compute_target]

    # Setup batch scoring environment from conda dependencies
    environment = Environment.from_conda_specification(
        name=args.environment_name, file_path=args.environment_specification
    )

    # Add environment variables
    environment.environment_variables = {
        "APPLICATIONINSIGHTS_CONNECTION_STRING": args.ai_connection_string
    }

    # Enable docker run
    environment.docker.enabled = True

    # Create run config
    run_config = RunConfiguration()
    run_config.environment = environment

    # Retrieve input and output datastores
    input_datastore = Datastore(workspace, args.input_datastore_name)
    output_datastore = Datastore(workspace, args.output_datastore_name)

    # Define build id parameter
    build_id_param = PipelineParameter("build_id", default_value=args.build_id)

    # Define input datapath parameter
    input_datapath = DataPath(datastore=input_datastore, path_on_datastore="")
    input_datapath_param = (
        PipelineParameter(name="input_datapath", default_value=input_datapath),
        DataPathComputeBinding(mode="mount"),
    )

    # Define output datapath parameter
    output_datapath = DataPath(datastore=output_datastore, path_on_datastore="")
    output_datapath_param = (
        PipelineParameter(name="output_datapath", default_value=output_datapath),
        DataPathComputeBinding(mode="mount"),
    )

    # Define score step for pipeline
    score_step = PythonScriptStep(
        name="score_data",
        compute_target=compute_target,
        source_directory="src/score",
        script_name="score.py",
        inputs=[input_datapath_param, output_datapath_param],
        runconfig=run_config,
        allow_reuse=False,
        arguments=[
            "--build_id", build_id_param,
            "--input_datapath", input_datapath_param,
            "--output_datapath", output_datapath_param,
        ],
    )

    # Define pipeline for batch scoring
    pipeline = Pipeline(workspace=workspace, steps=[score_step])

    return pipeline
    },
    delimiter=",",
    header=None)

df_log.columns = [
    'ModelType', 'FileName', 'ModelName', 'StartTime', 'EndTime', 'Duration',
    'Index', 'BatchSize', 'Status'
]
df_log['ModelType'] = df_log['ModelType'].apply(str).str.replace("'", '')
df_log['FileName'] = df_log['FileName'].apply(str).str.replace("'", '')
df_log['ModelName'] = df_log['ModelName'].apply(str).str.replace("'", '')
df_log['StartTime'] = df_log['StartTime'].apply(str).str.replace("'", '')
df_log['EndTime'] = df_log['EndTime'].apply(str).str.replace("'", '')
df_log['Duration'] = df_log['Duration'].apply(str).str.replace("'", '')
df_log['Status'] = df_log['Status'].apply(str).str.replace("'", '')
print(df_log.head())
print('Read and cleaned the log file')

# save the log file
output_path = os.path.join('./logs/', 'training_log')
df_log.to_csv(path_or_buf=output_path + '.csv', index=False)
print('Saved the training_log.csv')

# upload the log file
log_dstore = Datastore(ws, args.datastore)
log_dstore.upload_files(['./logs/training_log' + '.csv'],
                        target_path='training_log_' + str(datetime.datetime.now().date()),
                        overwrite=args.overwrite_logs,
                        show_progress=True)
print('Uploaded the training_log.csv')
def run(args): with open(args.run_spec_file, "r") as f: run_spec = yaml.load(f, Loader=yaml.SafeLoader) log = read_log_file() ws = get_workspace() experiment = Experiment(workspace=ws, name=log['aml_experiment_name']) experiments = log['experiments'] # Checking if experiment with same name already exists and cancelling and deleting it if needed if args.experiment_name in experiments: print("Experiment already exists. Please give a different name") exit(0) submitted_runs = [] all = True if args.j: all = False source_directory = tempfile.TemporaryDirectory() entry_script_file = "entry.py" with open(os.path.join(source_directory.name, entry_script_file), "w") as f: f.write(textwrap.dedent(entry_script_content).strip() + "\n") script_params = {} environment_variables = {} for x in run_spec['volumes']: if 'path' in x: script_params["--{}".format(x['name'])] = Datastore( workspace=ws, name=x['datastore']).path(x['path']).as_mount() environment_variables[x['name']] = str(script_params["--{}".format( x['name'])]) if x['name'] == 'OUTPUT_DIR': output_dir_datastore = x['datastore'] if 'environment_variables' in run_spec: for x in run_spec['environment_variables']: environment_variables[x['name']] = x['value'] setup_command = "" if 'setup' in run_spec: for x in run_spec['setup']: setup_command += x setup_command += '; ' compute_target = ComputeTarget(workspace=ws, name=run_spec['compute_name']) description = run_spec['description'] rtype = 'run' for run in run_spec['runs']: if all or run['name'] in args.j: script_params["--OUTPUT_DIR"] = Datastore( workspace=ws, name=output_dir_datastore).path("Experiments/{}/{}/{}".format( log['aml_experiment_name'], args.experiment_name, run['name'])).as_mount() environment_variables['OUTPUT_DIR'] = str( script_params["--OUTPUT_DIR"]) command = setup_command + run['command'] script_params['--command'] = command params = { 'use_gpu': True, 'custom_docker_image': run_spec['docker_image'], 'user_managed': True, 'source_directory': source_directory.name, 'entry_script': entry_script_file, 'script_params': script_params, 'environment_variables': environment_variables, 'compute_target': compute_target, } est = Estimator(**params) tags = { 'name': run['name'], 'experiment_name': args.experiment_name } submitted_run = experiment.submit(est, tags=tags) print("Submitting ", tags['name'], submitted_run.get_portal_url()) submitted_runs.append([submitted_run.id, run['name']]) log['experiments'][args.experiment_name] = { 'type': rtype, 'ids': submitted_runs, 'modified': datetime.datetime.now().timestamp(), 'output_dir_datastore': output_dir_datastore, 'description': description } write_log_file(log) source_directory.cleanup()
def main(): load_dotenv() workspace_name = os.environ.get("BASE_NAME") + "-AML-WS" resource_group = os.environ.get("BASE_NAME") + "-AML-RG" subscription_id = os.environ.get("SUBSCRIPTION_ID") tenant_id = os.environ.get("TENANT_ID") app_id = os.environ.get("SP_APP_ID") app_secret = os.environ.get("SP_APP_SECRET") sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN") train_script_path = os.environ.get("TRAIN_SCRIPT_PATH") evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH") register_script_path = os.environ.get("REGISTER_SCRIPT_PATH") vm_size_cpu = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU") compute_name_cpu = os.environ.get("AML_COMPUTE_CLUSTER_NAME") model_name = os.environ.get("MODEL_NAME") # Get Azure machine learning workspace aml_workspace = get_workspace(workspace_name, resource_group, subscription_id, tenant_id, app_id, app_secret) print(aml_workspace) # Get Azure machine learning cluster aml_compute_cpu = get_compute(aml_workspace, compute_name_cpu, vm_size_cpu) if aml_compute_cpu is not None: print(aml_compute_cpu) run_config = RunConfiguration(conda_dependencies=CondaDependencies.create( conda_packages=[ 'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras' ])) run_config.environment.docker.enabled = True model_name = PipelineParameter(name="model_name", default_value=model_name) def_blob_store = Datastore(aml_workspace, "workspaceblobstore") jsonconfigs = PipelineData("jsonconfigs", datastore=def_blob_store) config_suffix = datetime.datetime.now().strftime("%Y%m%d%H") train_step = PythonScriptStep( name="Train Model", script_name=train_script_path, compute_target=aml_compute_cpu, source_directory=sources_directory_train, arguments=[ "--config_suffix", config_suffix, "--json_config", jsonconfigs, "--model_name", model_name, ], runconfig=run_config, # inputs=[jsonconfigs], outputs=[jsonconfigs], allow_reuse=False, ) print("Step Train created") evaluate_step = PythonScriptStep( name="Evaluate Model ", script_name=evaluate_script_path, compute_target=aml_compute_cpu, source_directory=sources_directory_train, arguments=[ "--config_suffix", config_suffix, "--json_config", jsonconfigs, ], runconfig=run_config, inputs=[jsonconfigs], # outputs=[jsonconfigs], allow_reuse=False, ) print("Step Evaluate created") register_model_step = PythonScriptStep( name="Register New Trained Model", script_name=register_script_path, compute_target=aml_compute_cpu, source_directory=sources_directory_train, arguments=[ "--config_suffix", config_suffix, "--json_config", jsonconfigs, "--model_name", model_name, ], runconfig=run_config, inputs=[jsonconfigs], # outputs=[jsonconfigs], allow_reuse=False, ) print("Step register model created") evaluate_step.run_after(train_step) register_model_step.run_after(evaluate_step) steps = [register_model_step] train_pipeline = Pipeline(workspace=aml_workspace, steps=steps) train_pipeline.validate() published_pipeline = train_pipeline.publish( name="training-pipeline", description="Model training/retraining pipeline") train_pipeline_json = {} train_pipeline_json["rest_endpoint"] = published_pipeline.endpoint json_file_path = "ml_service/pipelines/train_pipeline.json" with open(json_file_path, "w") as outfile: json.dump(train_pipeline_json, outfile)
# #### Create a Run Configuration
#
# In[ ]:

# Create run config
runconfig = RunConfiguration()
runconfig.target = batchai_cluster_name
runconfig.batchai.node_count = 2
runconfig.environment.docker.enabled = True

# Set the datastore config in the runconfig
_default_datastore = Datastore(ws)
data_ref_configs = {}
data_ref = _default_datastore._get_data_reference()
data_ref_configs[data_ref.data_reference_name] = data_ref._to_config()
runconfig.data_references = data_ref_configs

# #### Run an experiment
#
# In[ ]:

# Set AMLBatchAI as the compute backend
compute_strategy_batchai = AMLBatchAICompute(ws, runconfig)
grid_cv_rf.compute_strategy = compute_strategy_batchai
import os

import azureml.core
from azureml.core import Workspace, Datastore
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.core import Experiment, Environment, ScriptRunConfig, RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.contrib.pipeline.steps import ParallelRunConfig, ParallelRunStep
from azureml.core.runconfig import DEFAULT_GPU_IMAGE

ws = Workspace.from_config()
def_blob_store = Datastore(ws, 'workspaceblobstore')

output_data = PipelineData('output_data',
                           datastore=def_blob_store,
                           output_name='output_data',
                           is_directory=True)
batch_input = output_data.as_dataset()

classification_data = PipelineData('classification_data',
                                   datastore=def_blob_store,
                                   output_name='classification_data',
                                   is_directory=True)

compute_target = ws.compute_targets['cpu-cluster']

environment_variables = {
    'POSTGRES_PASSWORD': os.environ['POSTGRES_PASSWORD'],
    'POSTGRES_HOSTNAME': 'ackbar-postgres.postgres.database.azure.com',
def main(): e = Env() # Get Azure machine learning workspace aml_workspace = Workspace.get( name=e.workspace_name, subscription_id=e.subscription_id, resource_group=e.resource_group, ) print(f"get_workspace:{aml_workspace}") # Get Azure machine learning cluster aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size) if aml_compute is not None: print(f"aml_compute:{aml_compute}") # Create a reusable Azure ML environment environment = get_environment( aml_workspace, e.aml_env_name, create_new=e.rebuild_env, enable_docker=True, dockerfile='ml_model/preprocess/Dockerfile' ) # run_config = RunConfiguration() run_config.environment = environment if e.datastore_name: datastore_name = e.datastore_name else: datastore_name = aml_workspace.get_default_datastore().name run_config.environment.environment_variables["DATASTORE_NAME"] = datastore_name # NOQA: E501 datastore = Datastore(aml_workspace, name=datastore_name) data_file_path_param = PipelineParameter(name="data_file_path", default_value=e.dataset_name) # NOQA: E501 # The version of the input/output dataset can't be determined at pipeline publish time, only run time. # NOQA: E501 # Options to store output data: # Option 1: Use blob API to write output data. Otherwise, no way to dynamically change the output dataset based on PipelineParameter, # NOQA: E501 # The following will not work. It generate a path like "PipelineParameter_Name:data_file_path_Default:gear_images" # NOQA: E501 # output_ds = OutputFileDatasetConfig(destination=(datastore, data_file_path_param)) # NOQA: E501 # This option means writing a file locally and upload to the datastore. Fewer dataset, more code. # NOQA: E501 # Option 2: Use a dynamic path in OutputFileDatasetConfig, and register a new dataset at completion # NOQA: E501 # Output dataset can be mounted, so more dataset to maintain, less code. # NOQA: E501 # Using Option 2 below. output_dataset = OutputFileDatasetConfig( name=e.processed_dataset_name, destination=(datastore, "/dataset/{output-name}/{run-id}") ).register_on_complete( name=e.processed_dataset_name) preprocess_step = PythonScriptStep( name="Preprocess Data with OS cmd", script_name='preprocess/preprocess_os_cmd_aml.py', compute_target=aml_compute, source_directory=e.sources_directory_train, arguments=[ "--dataset_name", e.dataset_name, "--datastore_name", datastore_name, "--data_file_path", data_file_path_param, "--output_dataset", output_dataset, ], runconfig=run_config, allow_reuse=False, ) print("Step Preprocess OS cmd created") steps = [preprocess_step] preprocess_pipeline = Pipeline(workspace=aml_workspace, steps=steps) preprocess_pipeline._set_experiment_name preprocess_pipeline.validate() published_pipeline = preprocess_pipeline.publish( name=e.preprocessing_pipeline_name, description="Data preprocessing OS cmd pipeline", version=e.build_id, ) print(f"Published pipeline: {published_pipeline.name}") print(f"for build {published_pipeline.version}")
def main():
    train_file = r"EdwardFry_Microsoft_issueDataset.csv"
    ws = Workspace.from_config()

    # Default datastore
    def_data_store = ws.get_default_datastore()
    # Loads config.json

    # Get the blob storage associated with the workspace
    def_blob_store = Datastore(ws, "workspaceblobstore")

    # Get file storage associated with the workspace
    def_file_store = Datastore(ws, "workspacefilestore")

    # Set data input and output
    xyz_phishing_dataset = Dataset.File.from_files([(def_blob_store, train_file)])
    output_data1 = OutputFileDatasetConfig(
        destination=(def_blob_store, 'outputdataset/{run-id}'))
    output_data_dataset = output_data1.register_on_complete(
        name='prepared_output_data')

    # Set compute
    compute_name = "aml-compute"
    vm_size = "STANDARD_NC6"
    if compute_name in ws.compute_targets:
        compute_target = ws.compute_targets[compute_name]
        if compute_target and type(compute_target) is AmlCompute:
            print('Found compute target: ' + compute_name)
    else:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size=vm_size,  # STANDARD_NC6 is GPU-enabled
            min_nodes=0,
            max_nodes=4)
        # create the compute target
        compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

        # Can poll for a minimum number of nodes and for a specific timeout.
        # If no min node count is provided it will use the scale settings for the cluster
        compute_target.wait_for_completion(show_output=True,
                                           min_node_count=None,
                                           timeout_in_minutes=20)

        # For a more detailed view of current cluster status, use the 'status' property
        print(compute_target.status.serialize())

    aml_run_config = RunConfiguration()
    # `compute_target` as defined in "Azure Machine Learning compute" section above
    aml_run_config.target = compute_target

    USE_CURATED_ENV = True
    if USE_CURATED_ENV:
        curated_environment = Environment.get(workspace=ws, name="AzureML-Tutorial")
        aml_run_config.environment = curated_environment
    else:
        aml_run_config.environment.python.user_managed_dependencies = False

        # Add some packages relied on by data prep step
        aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
            conda_packages=['pandas', 'scikit-learn'],
            pip_packages=['azureml-sdk', 'azureml-dataprep[fuse,pandas]'],
            pin_sdk_version=False)

    dataprep_source_dir = "./dataprep_src"
    entry_point = "prepare.py"
    # `my_dataset` as defined above
    ds_input = xyz_phishing_dataset.as_named_input('input1')

    # `output_data1`, `compute_target`, `aml_run_config` as defined above
    data_prep_step = PythonScriptStep(script_name=entry_point,
                                      source_directory=dataprep_source_dir,
                                      arguments=["--input", ds_input.as_download(),
                                                 "--output", output_data1],
                                      compute_target=compute_target,
                                      runconfig=aml_run_config,
                                      allow_reuse=True)

    train_source_dir = "./train_src"
    train_entry_point = "train.py"
    training_results = OutputFileDatasetConfig(name="training_results",
                                               destination=def_blob_store)

    train_step = PythonScriptStep(script_name=train_entry_point,
                                  source_directory=train_source_dir,
                                  arguments=["--prepped_data", output_data1.as_input(),
                                             "--training_results", training_results],
                                  compute_target=compute_target,
                                  runconfig=aml_run_config,
                                  allow_reuse=True)

    # list of steps to run (`compare_step` definition not shown)
    compare_models = [data_prep_step, train_step, compare_step]

    # Build the pipeline
    pipeline1 = Pipeline(workspace=ws, steps=[compare_models])

    #dataset_consuming_step = PythonScriptStep(
    #    script_name="iris_train.py",
    #    inputs=[iris_tabular_dataset.as_named_input("iris_data")],
    #    compute_target=compute_target,
    #    source_directory=project_folder
    #)
#run_context = Run.get_context()
#iris_dataset = run_context.input_datasets['iris_data']
#dataframe = iris_dataset.to_pandas_dataframe()

## Within a PythonScriptStep
#ws = Run.get_context().experiment.workspace

#step = PythonScriptStep(name="Hello World",
#                        script_name="hello_world.py",
#                        compute_target=aml_compute,
#                        source_directory=source_directory,
#                        allow_reuse=False,
#                        hash_paths=['hello_world.ipynb'])

# Submit the pipeline to be run
pipeline_run1 = Experiment(ws, 'Compare_Models_Exp').submit(pipeline1)
pipeline_run1.wait_for_completion()
# use regular expressions to determine if the string is a decimal
import re
regex = r'^[+-]{0,1}((\d*\.)|\d*)\d+$'

# get the train size and calculate out the test size
if re.match(regex, train_size) is None:
    raise Exception("Please provide a decimal value as a string")
if Decimal(train_size) >= 0.85:
    raise Exception('Training size cannot be equal to or larger than 0.85')

train_size = Decimal(train_size)
test_size = Decimal("1.00") - Decimal(train_size)

# get the datastore and the tabular dataset
datastore = Datastore(ws, datastore_name)
path_on_datastore = os.path.join(folder_name, file_name)
dataset = Dataset.Tabular.from_delimited_files(path=(datastore, path_on_datastore))

# convert to pandas to split the data
data = dataset.to_pandas_dataframe()
X = data.drop(columns=label_name)
y = data[label_name]

# split the data using scikit-learn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=float(test_size), random_state=101)
import azureml.core
from azureml.core import Workspace
from azureml.core import Datastore
from azureml.core import Experiment
from azureml.core.compute import ComputeTarget
from azureml.core.runconfig import ContainerRegistry
from azureml.train.estimator import Estimator
from azureml.widgets import RunDetails

#
# Get the workspace, compute target, and datastore we prepared previously.
#

ws = Workspace.from_config()
ct = ComputeTarget(workspace=ws, name="cpucluster-II")
ds = Datastore(workspace=ws, name="hellotfstore")

#
# Create an estimator.
#

# Single node
est_1 = Estimator(
    compute_target=ct,
    use_gpu=False,
    node_count=1,
    pip_packages=['tensorflow==1.13.1'],
    source_directory="../",
    entry_script="mnist-mlp.py",
    script_params={"--data-dir": ds.path("data/mnist").as_mount()})
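# Hedged follow-up: submitting the estimator as an experiment run. The experiment name
# is an assumption; Experiment and RunDetails are already imported above.
exp = Experiment(workspace=ws, name="mnist-mlp")
run_1 = exp.submit(est_1)
RunDetails(run_1).show()
run_1.wait_for_completion(show_output=True)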
def main(): # Parse command line arguments args = parse_args(sys.argv[1:]) # Retreive workspace workspace = Workspace.get( subscription_id=args.subscription_id, resource_group=args.resource_group, name=args.workspace_name, ) # Retreive compute cluster compute_target = workspace.compute_targets[args.compute_target] # Get baseline dataset baseline_dataset = Dataset.get_by_name(workspace, args.baseline_dataset_name) # Get model id and version model_name, model_version = args.model_id.split(":") # Get AKS Endpoint aks_endpoint = AksWebservice(workspace, args.endpoint_name) # Make call to endpoint with sample data and wait for the data to arrive in the storage account # [Note: this step is required to ensure a data sample is present for validation when # registering a new target dataset below - this can take up to 10 mins to appear] input_record = ( baseline_dataset.take(1) .to_pandas_dataframe() .drop(["cardiovascular_disease", "datetime"], axis=1) .to_dict("records") ) input_data = json.dumps({"data": input_record}) print("Variable [input_data]:", input_data) aks_endpoint.run(input_data) time.sleep(600) # Define target dataset target_dataset_name = ( f"inference-data-{model_name}-{model_version}-{args.endpoint_name}" ) # Get current registered target dataset definition current_target_dataset = Dataset.get_by_name(workspace, name=target_dataset_name) current_target_dataset_definition = json.loads(current_target_dataset._definition) # Get current registered target dataset datasetore definition current_target_dataset_datastore_definition = current_target_dataset_definition[ "blocks" ][0]["arguments"]["datastores"][0] # Define current registered target dataset datasetore target_dataset_datastore = Datastore( workspace, current_target_dataset_datastore_definition["datastoreName"] ) # Define current registered target dataset datasetore path target_dataset_datastore_path = current_target_dataset_datastore_definition["path"] # Create updated target dataset with non-string feature data types target_dataset = Dataset.Tabular.from_delimited_files( path=(target_dataset_datastore, target_dataset_datastore_path), validate=False, infer_column_types=False, set_column_types={ "age": DataType.to_float(decimal_mark="."), "height": DataType.to_float(decimal_mark="."), "weight": DataType.to_float(decimal_mark="."), "systolic": DataType.to_float(decimal_mark="."), "diastolic": DataType.to_float(decimal_mark="."), "gender": DataType.to_string(), "cholesterol": DataType.to_string(), "glucose": DataType.to_string(), "smoker": DataType.to_string(), "alcoholic": DataType.to_string(), "active": DataType.to_string(), "datetime": DataType.to_datetime(), }, ) # Assign timestamp column for Tabular Dataset to activate time series related APIs target_dataset = target_dataset.with_timestamp_columns( timestamp=target_dataset_timestamp_column ) # Register updated dataset version target_dataset.register( workspace, name=target_dataset_name, create_new_version=True ) print("Variable [target_dataset]:", target_dataset) print("Variable [baseline_dataset]:", baseline_dataset) # Define features to monitor feature_list = args.feature_list.split(",") print("Variable [feature_list]:", args.feature_list) # List data drift detectors drift_detector_list = DataDriftDetector.list(workspace) # Delete existing data drift detector for drift_monitor in drift_detector_list: if drift_monitor.name == args.data_drift_monitor_name: print("Deleteing existing data drift monitor...") drift_monitor.delete() # Define data drift detector monitor = 
DataDriftDetector.create_from_datasets( workspace, args.data_drift_monitor_name, baseline_dataset, target_dataset, compute_target=compute_target, frequency=args.frequency, feature_list=feature_list, ) print("Variable [monitor]:", monitor) # Enable the pipeline schedule for the data drift detector monitor.enable_schedule()
daily = ScheduleRecurrence(frequency='Day', interval=1)
pipeline_schedule = Schedule.create(ws, name='Daily training',
                                    description='trains model every day',
                                    pipeline_id=published_pipeline.id,
                                    experiment_name='Training_pipeline',
                                    recurrence=daily)

# We can also create a schedule that runs the pipeline whenever the data changes
from azureml.core import Datastore
from azureml.pipeline.core import Schedule

training_ds = Datastore(workspace=ws, name="blob_data")
pipeline_schedule = Schedule.create(ws, name='Reactive training',
                                    description='trains model on data change',
                                    pipeline_id=published_pipeline_id,
                                    experiment_name='training_pipeline',
                                    datastore=training_ds,
                                    path_on_datastore='data/training')
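# Hedged sketch of inspecting and pausing the schedules created above; Schedule.list
# and disable() are standard azureml.pipeline.core APIs, the loop itself is illustrative.
for schedule in Schedule.list(ws):
    print(schedule.id, schedule.name, schedule.status)
    if schedule.name == 'Daily training':
        schedule.disable(wait_for_provisioning=True)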
def create_experiment_config(workspace): ######################################## ### Creating data prep Pipeline Step ### ######################################## # Load settings print("Loading settings") data_prep_step_path = os.path.join("steps", "data_prep") with open(os.path.join(data_prep_step_path, "step.json")) as f: data_prep_settings = json.load(f) # Setup datasets of first step print("Setting up datasets") data_prep_input = Dataset.get_by_name(workspace=workspace, name=data_prep_settings.get( "dataset_input_name", None)).as_named_input( data_prep_settings.get( "dataset_input_name", None)).as_mount() data_prep_output = PipelineData( name=data_prep_settings.get("dataset_output_name", None), datastore=Datastore(workspace=workspace, name=data_prep_settings.get( "datastore_output_name", "workspaceblobstore")), output_mode="mount").as_dataset() # Uncomment next lines, if you want to register intermediate dataset #data_prep_output.register( # name=data_prep_settings.get("dataset_output_name", None), # create_new_version=True #) # Create conda dependencies print("Creating conda dependencies") data_prep_dependencies = CondaDependencies.create( pip_packages=data_prep_settings.get("pip_packages", []), conda_packages=data_prep_settings.get("conda_packages", []), python_version=data_prep_settings.get("python_version", "3.6.2")) # Create run configuration print("Creating RunConfiguration") data_prep_run_config = RunConfiguration( conda_dependencies=data_prep_dependencies, framework=data_prep_settings.get("framework", "Python")) # Loading compute target print("Loading ComputeTarget") data_prep_compute_target = ComputeTarget(workspace=workspace, name=data_prep_settings.get( "compute_target_name", None)) # Create python step print("Creating Step") data_prep = PythonScriptStep( name=data_prep_settings.get("step_name", None), script_name=data_prep_settings.get("script_name", None), arguments=data_prep_settings.get("arguments", []), compute_target=data_prep_compute_target, runconfig=data_prep_run_config, inputs=[data_prep_input], outputs=[data_prep_output], params=data_prep_settings.get("parameters", []), source_directory=data_prep_step_path, allow_reuse=data_prep_settings.get("allow_reuse", True), version=data_prep_settings.get("version", None), ) ############################################### ### Creating data model train Pipeline Step ### ############################################### # Load settings print("Loading settings") model_train_step_path = os.path.join("steps", "model_train") with open(os.path.join(model_train_step_path, "step.json")) as f: model_train_settings = json.load(f) hyperparameter_sampling_settings = model_train_settings.get( "hyperparameter_sampling", {}) # Setup datasets of first step print("Setting up datasets") model_train_input = data_prep_output.as_named_input( name=model_train_settings.get("dataset_input_name", None)) model_train_output = PipelineData( name=model_train_settings.get("dataset_output_name", None), datastore=Datastore(workspace=workspace, name=model_train_settings.get( "datastore_output_name", None)), output_mode="mount", ).as_dataset() # Uncomment next lines, if you want to register intermediate dataset #model_train_output.register( # name=model_train_settings.get("dataset_output_name", None), # create_new_version=True #) # Create conda dependencies print("Creating conda dependencies") model_train_dependencies = CondaDependencies.create( pip_packages=model_train_settings.get("pip_packages", []), conda_packages=model_train_settings.get("conda_packages", 
[]), python_version=model_train_settings.get("python_version", "3.6.2")) # Create run configuration print("Creating RunConfiguration") model_train_run_config = RunConfiguration( conda_dependencies=model_train_dependencies, framework=model_train_settings.get("framework", "Python")) # Loading compute target print("Loading ComputeTarget") model_train_compute_target = ComputeTarget(workspace=workspace, name=model_train_settings.get( "compute_target_name", None)) # Create distributed training backend print("Creating distributed training backend") distributed_training_backend = get_distributed_backend( backend_name=model_train_settings.get("distributed_backend", None)) # Create Estimator for Training print("Creating Estimator for training") model_train_estimator = Estimator( source_directory=model_train_step_path, entry_script=model_train_settings.get("script_name", None), environment_variables=model_train_settings.get("parameters", None), compute_target=model_train_compute_target, node_count=model_train_settings.get("node_count", None), distributed_training=distributed_training_backend, conda_packages=model_train_settings.get("conda_packages", None), pip_packages=model_train_settings.get("pip_packages", None), ) try: # Create parameter sampling print("Creating Parameter Sampling") parameter_dict = {} parameters = hyperparameter_sampling_settings.get( "parameters", {}) if "parameters" in hyperparameter_sampling_settings else {} for parameter_name, parameter_details in parameters.items(): parameter_distr = get_parameter_distribution( distribution=parameter_details.get("distribution", None), **parameter_details.get("settings", {})) parameter_dict[f"--{parameter_name}"] = parameter_distr model_train_ps = get_parameter_sampling( sampling_method=hyperparameter_sampling_settings.get( "method", None), parameter_dict=parameter_dict) # Get Policy definition policy_settings = hyperparameter_sampling_settings.get("policy", {}) kwargs = { key: value for key, value in policy_settings.items() if key not in ["policy_method", "evaluation_interval", "delay_evaluation"] } # Create termination policy print("Creating early termination policy") model_train_policy = get_policy( policy_method=policy_settings.get("method", ""), evaluation_interval=policy_settings.get("evaluation_interval", None), delay_evaluation=policy_settings.get("delay_evaluation", None), **kwargs) # Create HyperDriveConfig print("Creating HyperDriveConfig") model_train_hyperdrive_config = HyperDriveConfig( estimator=model_train_estimator, hyperparameter_sampling=model_train_ps, policy=model_train_policy, primary_metric_name=hyperparameter_sampling_settings.get( "primary_metric", None), primary_metric_goal=PrimaryMetricGoal.MINIMIZE if "min" in hyperparameter_sampling_settings.get( "primary_metric_goal", None) else PrimaryMetricGoal.MAXIMIZE, max_total_runs=hyperparameter_sampling_settings.get( "max_total_runs", 1), max_concurrent_runs=hyperparameter_sampling_settings.get( "max_concurrent_runs", 1), max_duration_minutes=hyperparameter_sampling_settings.get( "max_duration_minutes", None)) # Create HyperDriveStep print("Creating HyperDriveStep") model_train = HyperDriveStep( name=model_train_settings.get("step_name", None), hyperdrive_config=model_train_hyperdrive_config, estimator_entry_script_arguments=model_train_settings.get( "arguments", None), inputs=[model_train_input], outputs=[model_train_output], allow_reuse=model_train_settings.get("allow_reuse", True), version=model_train_settings.get("version", True)) except: print("Not all required 
parameters specified for HyperDrive step") # Create EstimatorStep print("Creating EstimatorStep") model_train = EstimatorStep( name=model_train_settings.get("step_name", None), estimator=model_train_estimator, estimator_entry_script_arguments=model_train_settings.get( "arguments", None), inputs=[model_train_input], outputs=[model_train_output], compute_target=model_train_compute_target, allow_reuse=model_train_settings.get("allow_reuse", True), version=model_train_settings.get("version", True)) ######################### ### Creating Pipeline ### ######################### # Create Pipeline print("Creating Pipeline") pipeline = Pipeline( workspace=workspace, steps=[model_train], description="Training Pipeline", ) # Validate pipeline print("Validating pipeline") pipeline.validate() return pipeline
from azureml.core import Workspace, Datastore, Dataset

ws = Workspace.from_config()
ds = Datastore(ws, "mydatastore")

# creating Dataset
# creating path, we can have multiple data paths like this
dataset_path = [(ds, "loan.csv")]
loan_dataset = Dataset.Tabular.from_delimited_files(path=dataset_path)
dataset = loan_dataset.register(workspace=ws, name="Loan Application")

# see all the datasets
for i in list(ws.datasets.keys()):
    print(i)

# get a dataset
df = Dataset.get_by_name(ws, "Loan Application")
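# Hedged follow-up showing typical use of the retrieved dataset; version selection and
# to_pandas_dataframe() are standard Dataset APIs.
loan_df = Dataset.get_by_name(ws, "Loan Application").to_pandas_dataframe()
print(loan_df.head())

# a specific registered version can also be requested
loan_v1 = Dataset.get_by_name(ws, "Loan Application", version=1)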
# Note: source_directory and entry_script are local paths, source_directory/entry_script
source_directory = "./"
# print(sys.argv[1])
# entry_script = sys.argv[1]
entry_script = 'run.py'
# entry_script = "./entry-script.py"

# subscription_id = config.subscription_id
# resource_group = config.resource_group
# workspace_name = config.workspace_name
ws = Workspace(subscription_id=subscription_id,
               resource_group=resource_group,
               workspace_name=workspace_name)

# cluster_name = config.cluster_name
ct = ComputeTarget(workspace=ws, name=cluster_name)

# datastore_name = config.datastore_name
ds = Datastore(workspace=ws, name=datastore_name)

workdir = os.path.realpath('.')[os.path.realpath('.').find('FixMatch-pytorch'):]
workdir = workdir.replace('\\', '/')

script_params = {
    "--workdir": ds.path('/projects/' + workdir).as_mount(),  # REQUIRED !!!
    "--cxk_volna": ds.path('/').as_mount(),
    "--exp_name": workdir.split('/')[-1],
}


def make_container_registry(address, username, password):
    cr = ContainerRegistry()
    cr.address = address
    cr.username = username
    cr.password = password
def load(quick_run, data_path, cache_path, model_name, num_gpus, random_seed): # Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs. QUICK_RUN = quick_run # Wikigold dataset DATA_URL = ( "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets" "/master/data/wikigold/CONLL-format/data/wikigold.conll.txt" ) # fraction of the dataset used for testing TEST_DATA_FRACTION = 0.3 # sub-sampling ratio SAMPLE_RATIO = 1 # the data path used to save the downloaded data file DATA_PATH = data_path # the cache data path during find tuning CACHE_DIR = cache_path if not os.path.exists(os.path.dirname(DATA_PATH)): os.mkdir(os.path.dirname(DATA_PATH)) if not os.path.exists(DATA_PATH): os.mkdir(DATA_PATH) if not os.path.exists(CACHE_DIR): os.mkdir(CACHE_DIR) # set random seeds RANDOM_SEED = random_seed torch.manual_seed(RANDOM_SEED) MODEL_NAME = model_name # MODEL_NAME = "distilbert" DO_LOWER_CASE = False MAX_SEQ_LENGTH = 200 TRAILING_PIECE_TAG = "X" NUM_GPUS = num_gpus BATCH_SIZE = 16 # update variables for quick run option if QUICK_RUN: SAMPLE_RATIO = 0.1 NUM_TRAIN_EPOCHS = 1 # download data file_name = DATA_URL.split("/")[-1] # a name for the downloaded file maybe_download(DATA_URL, file_name, DATA_PATH) data_file = os.path.join(DATA_PATH, file_name) # parse CoNll file sentence_list, labels_list = read_conll_file(data_file, sep=" ", encoding='utf-8') # sub-sample (optional) random.seed(RANDOM_SEED) sample_size = int(SAMPLE_RATIO * len(sentence_list)) sentence_list, labels_list = list( zip(*random.sample(list(zip(sentence_list, labels_list)), k=sample_size)) ) # train-test split train_sentence_list, test_sentence_list, train_labels_list, test_labels_list = train_test_split( sentence_list, labels_list, test_size=TEST_DATA_FRACTION, random_state=RANDOM_SEED ) processor = TokenClassificationProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE, cache_dir=CACHE_DIR) label_map = TokenClassificationProcessor.create_label_map( label_lists=labels_list, trailing_piece_tag=TRAILING_PIECE_TAG ) train_dataset = processor.preprocess( text=train_sentence_list, max_len=MAX_SEQ_LENGTH, labels=train_labels_list, label_map=label_map, trailing_piece_tag=TRAILING_PIECE_TAG, ) # train_data_loader = DataLoader(train_dataset) test_dataset = processor.preprocess( text=test_sentence_list, max_len=MAX_SEQ_LENGTH, labels=test_labels_list, label_map=label_map, trailing_piece_tag=TRAILING_PIECE_TAG, ) torch.save(train_dataset, os.path.join(DATA_PATH, 'train.pt')) torch.save(test_dataset, os.path.join(DATA_PATH, 'test.pt')) torch.save(label_map, os.path.join(DATA_PATH, 'label_map.pt')) # Default datastore def_data_store = ws.get_default_datastore() # Get the blob storage associated with the workspace def_blob_store = Datastore(ws, "workspaceblobstore") # Get file storage associated with the workspace def_file_store = Datastore(ws, "workspacefilestore") try: def_blob_store.upload_files( [os.path.join(DATA_PATH, 'train.pt')], target_path="nerdata", overwrite=True, show_progress=True) except Exception as e: print(f"Failed to upload -> {e}") try: def_blob_store.upload_files( [os.path.join(DATA_PATH, 'test.pt')], target_path="nerdata", overwrite=True, show_progress=True) except Exception as e: print(f"Failed to upload -> {e}") try: def_blob_store.upload_files( [os.path.join(DATA_PATH, 'label_map.pt')], target_path="nerdata", overwrite=True, show_progress=True) except Exception as e: print(f"Failed to upload -> {e}") train_datastore_paths = [(def_blob_store, 'nerdata/train.pt')] 
    test_datastore_paths = [(def_blob_store, 'nerdata/test.pt')]
    label_map_datastore_paths = [(def_blob_store, 'nerdata/label_map.pt')]
    # def_blob_store.upload(src_dir=DATA_PATH, target_path="nerdata", overwrite=True, show_progress=True)

    train_ds = Dataset.File.from_files(path=train_datastore_paths)
    test_ds = Dataset.File.from_files(path=test_datastore_paths)
    label_map_ds = Dataset.File.from_files(path=label_map_datastore_paths)

    train_ds = train_ds.register(workspace=ws,
                                 name='ner_bert_train_ds',
                                 description='Named Entity Recognition with BERT (Training set)',
                                 create_new_version=False)

    test_ds = test_ds.register(workspace=ws,
                               name='ner_bert_test_ds',
                               description='Named Entity Recognition with BERT (Testing set)',
                               create_new_version=False)

    label_map_ds = label_map_ds.register(workspace=ws,
                                         name='ner_bert_label_map_ds',
                                         description='Named Entity Recognition with BERT (Label map)',
                                         create_new_version=False)

    train_dataloader = dataloader_from_dataset(
        train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True, distributed=False
    )
    test_dataloader = dataloader_from_dataset(
        test_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False, distributed=False
    )

    return (train_dataloader, test_dataloader, label_map)
parser = argparse.ArgumentParser("Pipeline")
parser.add_argument(
    "--pipeline_action",
    type=str,
    choices=["pipeline-test", "publish"],
    help="Determines if pipeline needs to run on small data set \
          or pipeline needs to be republished",
    # default="pipeline-test",
)
args = parser.parse_args()

# Get workspace
ws = Workspace.from_config(path="aml_config/config.json", auth=cli_auth)
def_blob_store = Datastore(ws, "workspaceblobstore")

# Get AML Compute name and Experiment Name
with open("aml_config/security_config.json") as f:
    config = json.load(f)
experiment_name = config["experiment_name"]
aml_cluster_name = config["aml_cluster_name"]
aml_pipeline_name = "training-pipeline"
source_directory = "code"

# Run Config
# Declare packages dependencies required in the pipeline (these can also be expressed as a YML file)
# cd = CondaDependencies.create(pip_packages=["azureml-defaults", 'tensorflow==1.8.0'])
cd = CondaDependencies("aml_config/conda_dependencies.yml")
# In[ ]:

# Module
select_columns_in_dataset = Module.load(ws, namespace='azureml', name='Select Columns in Dataset')
clean_missing_data = Module.load(ws, namespace='azureml', name='Clean Missing Data')
split_data = Module.load(ws, namespace='azureml', name='Split Data')
join_data = Module.load(ws, namespace='azureml', name='Join Data')

# Dataset
try:
    dset = Dataset.get_by_name(ws, 'Automobile_price_data_(Raw)')
except Exception:
    global_datastore = Datastore(ws, name="azureml_globaldatasets")
    dset = Dataset.File.from_files(global_datastore.path('GenericCSV/Automobile_price_data_(Raw)'))
    dset.register(workspace=ws, name='Automobile_price_data_(Raw)', create_new_version=True)

blob_input_data = dset

# In[ ]:

# sub pipeline: TODO improve this experience
@dsl.pipeline(name='sub sub', description='sub')
def sub_sub_pipeline(minimum_missing_value_ratio):
    module1 = select_columns_in_dataset(
        dataset=blob_input_data,
def _get_data_from_dataprep(dataprep_json, automl_settings_obj, logger): current_run = Run.get_submitted_run() parent_run_id = _get_parent_run_id(current_run._run_id) print("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id)) logger.info("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id)) try: import azureml.train.automl._dataprep_utilities as dataprep_utilities except Exception as e: e.error_type = ErrorTypes.Unclassified log_traceback(e, logger) logger.error(e) raise e fit_iteration_parameters_dict = dict() class RetrieveNumpyArrayError(Exception): def __init__(self): super().__init__() try: print("Resolving Dataflows...") logger.info("Resolving Dataflows...") dataprep_json_obj = json.loads(dataprep_json) if 'activities' in dataprep_json_obj: # json is serialized dataflows dataflow_dict = dataprep_utilities.load_dataflows_from_json( dataprep_json) for k in ['X', 'X_valid', 'sample_weight', 'sample_weight_valid']: fit_iteration_parameters_dict[k] = dataprep_utilities.try_retrieve_pandas_dataframe(dataflow_dict.get(k)) for k in ['y', 'y_valid']: try: fit_iteration_parameters_dict[k] = dataprep_utilities.try_retrieve_numpy_array(dataflow_dict.get(k)) except IndexError: raise RetrieveNumpyArrayError() cv_splits_dataflows = [] i = 0 while 'cv_splits_indices_{0}'.format(i) in dataflow_dict: cv_splits_dataflows.append( dataflow_dict['cv_splits_indices_{0}'.format(i)]) i = i + 1 fit_iteration_parameters_dict['cv_splits_indices'] = None if len(cv_splits_dataflows) == 0 \ else dataprep_utilities.try_resolve_cv_splits_indices(cv_splits_dataflows) else: # json is dataprep options print('Creating Dataflow from options...\r\nOptions:') logger.info('Creating Dataflow from options...') print(dataprep_json_obj) datastore_name = dataprep_json_obj['datastoreName'] # mandatory data_path = dataprep_json_obj['dataPath'] # mandatory label_column = dataprep_json_obj['label'] # mandatory separator = dataprep_json_obj.get('columnSeparator', ',') header = dataprep_json_obj.get('promoteHeader', True) encoding = dataprep_json_obj.get('encoding', None) quoting = dataprep_json_obj.get('ignoreNewlineInQuotes', False) skip_rows = dataprep_json_obj.get('skipRows', 0) feature_columns = dataprep_json_obj.get('features', []) from azureml.core import Datastore import azureml.dataprep as dprep if header: header = dprep.PromoteHeadersMode.CONSTANTGROUPED else: header = dprep.PromoteHeadersMode.NONE try: encoding = dprep.FileEncoding[encoding] except: encoding = dprep.FileEncoding.UTF8 ws = Run.get_context().experiment.workspace datastore = Datastore(ws, datastore_name) dflow = dprep.read_csv(path=datastore.path(data_path), separator=separator, header=header, encoding=encoding, quoting=quoting, skip_rows=skip_rows) if len(feature_columns) == 0: X = dflow.drop_columns(label_column) else: X = dflow.keep_columns(feature_columns) print('Inferring types for feature columns...') logger.info('Inferring types for feature columns...') sct = X.builders.set_column_types() sct.learn() sct.ambiguous_date_conversions_drop() X = sct.to_dataflow() y = dflow.keep_columns(label_column) if automl_settings_obj.task_type.lower() == 'regression': y = y.to_number(label_column) print('X:') print(X) logger.info('X:') logger.info(X) print('y:') print(y) logger.info('y:') logger.info(y) try: from azureml.train.automl._dataprep_utilities import try_retrieve_pandas_dataframe_adb _X = try_retrieve_pandas_dataframe_adb(X) fit_iteration_parameters_dict['X'] = _X.values 
fit_iteration_parameters_dict['x_raw_column_names'] = _X.columns.values except ImportError: logger.info("SDK version does not support column names extraction, fallback to old path") fit_iteration_parameters_dict['X'] = dataprep_utilities.try_retrieve_pandas_dataframe(X) try: fit_iteration_parameters_dict['y'] = dataprep_utilities.try_retrieve_numpy_array(y) except IndexError: raise RetrieveNumpyArrayError() logger.info("Finish getting data using dataprep.") return fit_iteration_parameters_dict except Exception as e: print("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e)) logger.error("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e)) if isinstance(e, RetrieveNumpyArrayError): logger.debug("Label column (y) does not exist in user's data.") e.error_type = ErrorTypes.User elif "The provided path is not valid." in str(e): logger.debug("User's data is not accessible from remote run.") e.error_type = ErrorTypes.User elif "Required secrets are missing. Please call use_secrets to register the missing secrets." in str(e): logger.debug("User should use Datastore to data that requires secrets.") e.error_type = ErrorTypes.User else: e.error_type = ErrorTypes.Client log_traceback(e, logger) raise RuntimeError("Error during extracting Dataflows")
# In[121]:

from azureml.core import Workspace, Datastore

# In[122]:

# Default datastore
def_data_store = ws.get_default_datastore()

# Get the blob storage associated with the workspace
def_blob_store = Datastore(ws, "workspaceblobstore")

# Get file storage associated with the workspace
def_file_store = Datastore(ws, "workspacefilestore")

# In[123]:

def_blob_store.upload_files(
    ["Downloads/005930.KS.csv"],
    target_path="xyz",
    overwrite=True)

# In[ ]:
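# Hedged continuation: registering the uploaded CSV as a tabular dataset. The dataset
# name is an assumption; from_delimited_files and register are standard Dataset APIs
# used elsewhere in this collection.
from azureml.core import Dataset

stock_ds = Dataset.Tabular.from_delimited_files(
    path=[(def_blob_store, 'xyz/005930.KS.csv')])
stock_ds = stock_ds.register(workspace=ws, name='stock_005930_ks',
                             create_new_version=True)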