def update_io(inputs, outputs):
    # `workspace`, `run_config`, `input_data`, and `output_data` are captured
    # from the enclosing scope.
    for key, value in inputs.items():
        if isinstance(value, _Dataset):
            raise UserErrorException(
                "Dataset cannot be used without providing a name for the run. Please provide "
                "a name by calling the as_named_input instance method on dataset."
            )
        elif isinstance(value, DatasetConsumptionConfig):
            value.dataset._ensure_saved(workspace)
            inputs[key] = Data.create(value)
            input_data.append(value)

            # Set the environment variable for mount validation
            if value.dataset._consume_latest:
                env_vars = run_config.environment.environment_variables
                if _SKIP_VALIDATE_DATASETS not in env_vars:
                    env_vars[_SKIP_VALIDATE_DATASETS] = value.name
                else:
                    env_vars[_SKIP_VALIDATE_DATASETS] = ",".join(
                        [env_vars[_SKIP_VALIDATE_DATASETS], value.name])
        elif isinstance(value, Data):
            input_data.append(value)
        else:
            raise UserErrorException("{} cannot be used as input.".format(
                type(value).__name__))

    for key, value in outputs.items():
        if isinstance(value, OutputDatasetConfig):
            outputs[key] = output_data[key] = value._to_output_data()
        elif isinstance(value, OutputData):
            output_data[key] = value
        else:
            raise UserErrorException("{} cannot be used as output.".format(
                type(value).__name__))

def _get_upload_from_files(file_paths, target_path, relative_root, skip_root_check):
    paths_to_upload = []
    target_path = _sanitize_target_path(target_path)
    for file_path in file_paths:
        if not skip_root_check and relative_root not in file_path and relative_root != "/":
            raise UserErrorException(
                "relative_root: '{}' is not part of the file_path: '{}'.".format(
                    relative_root, file_path))
        if not os.path.isfile(file_path):
            err_msg = "'{}' does not point to a file. " + \
                      "Please upload the file to the cloud first if running in a cloud notebook."
            raise UserErrorException(err_msg.format(file_path))

        target_file_path = to_unix_path(file_path)
        if relative_root != "/":
            # need to do this because Windows doesn't support relpath if the partition is different
            target_file_path = os.path.relpath(target_file_path,
                                               to_unix_path(relative_root))
        else:
            # strip away / otherwise we will create a folder in the container with no name
            target_file_path = target_file_path.lstrip("/")

        if target_path:
            target_file_path = os.path.join(target_path, target_file_path)

        paths_to_upload.append((file_path, target_file_path))
    return paths_to_upload

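# Example (hedged sketch) of the Windows relpath limitation noted in the comment
# above: relpath cannot cross drive letters, which is why paths are normalized to
# unix style first. ntpath implements the Windows path rules on any platform, so
# this reproduces the failure portably; the drive letters and paths are hypothetical.
import ntpath

try:
    ntpath.relpath("D:\\data\\file.csv", "C:\\project")
except ValueError as err:
    # e.g. "path is on mount 'D:', start on mount 'C:'"
    print("relpath across partitions fails:", err)
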
def reactivate_dataset(workspace=None, dataset_name=None, dataset_id=None, logger=None):
    if not _check_python():
        raise UserErrorException(
            'The dataset command subgroup is only supported with Python 3.5 or later')
    dataset = Dataset.get(workspace, dataset_name, dataset_id)
    dataset_state = dataset.state
    if dataset_state == 'active':
        raise UserErrorException("Dataset '{}' ({}) is already active".format(
            dataset.name, dataset.id))
    dataset.reactivate()
    dataset = Dataset.get(workspace, name=dataset.name)
    if dataset.state == 'active':
        logger.info("Dataset '{}' ({}) was reactivated successfully".format(
            dataset.name, dataset.id))
        return dataset._get_base_info_dict_show()
    else:
        logger.debug(
            "dataset reactivate error. name: {} id: {} state: {}".format(
                dataset.name, dataset.id, dataset.state))
        raise Exception("Error, Dataset '{}' ({}) was not reactivated".format(
            dataset.name, dataset.id))

def _create(cls, definition, properties=None, registration=None, telemetry_info=None):
    if registration is not None and not isinstance(registration, _DatasetRegistration):
        raise UserErrorException(
            'registration must be an instance of `_DatasetRegistration`')
    if telemetry_info is not None and not isinstance(telemetry_info, _DatasetTelemetryInfo):
        raise UserErrorException(
            'telemetry_info must be an instance of `_DatasetTelemetryInfo`')
    dataset = cls()
    dataset._definition = definition  # definition is either str or Dataflow, which is immutable
    dataset._properties = deepcopy(properties) if properties else {}
    dataset._registration = registration
    dataset._telemetry_info = telemetry_info

    from azureml.data._partition_format import parse_partition_format
    steps = dataset._dataflow._get_steps()
    partition_keys = []
    for step in steps:
        if step.step_type == 'Microsoft.DPrep.AddColumnsFromPartitionFormatBlock' and \
                step.arguments['partitionFormat']:
            parsed_result = parse_partition_format(step.arguments['partitionFormat'])
            if len(parsed_result) == 3 and parsed_result[2]:
                partition_keys = parsed_result[2]
            break
    dataset._properties[_PARTITION_KEYS] = partition_keys
    return dataset

def save(self, path=None):
    """Save the conda dependencies object to file.

    :param path: The fully qualified path of the file you want to save to.
    :type path: str
    :return: The normalized conda path.
    :rtype: str
    :raises azureml.exceptions.UserErrorException: Raised for issues saving the dependencies.
    """
    if os.path.isdir(path):
        raise UserErrorException("Cannot save a conda environment specification file to a directory. "
                                 "Please specify a fully qualified path along with the "
                                 "file name to save the file.")

    parent_dir = os.path.dirname(path)
    if parent_dir == "" or (os.path.exists(parent_dir) and os.path.isdir(parent_dir)):
        normalized_conda_path = normalize_windows_paths(path)
    else:
        raise UserErrorException(
            "Cannot save the conda environment specification file to an invalid path.")

    self._validate()
    with open(normalized_conda_path, 'w') as outfile:
        ruamel.yaml.round_trip_dump(self._conda_dependencies, outfile)
    return normalized_conda_path

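# Example (hedged sketch; the output path is hypothetical): save() refuses
# directories and non-existent parent directories, so the target folder is
# created first.
import os
from azureml.core.conda_dependencies import CondaDependencies

conda_deps = CondaDependencies()  # starts from the default specification
os.makedirs("./environment", exist_ok=True)  # parent directory must already exist
saved_path = conda_deps.save("./environment/conda_dependencies.yml")
print(saved_path)  # the normalized path that was written
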
def _validate_yaml(ruamel_yaml_object):
    if not isinstance(ruamel_yaml_object, dict):
        raise UserErrorException("Environment error: not a valid YAML structure")

    for key in ruamel_yaml_object.keys():
        if str(key) not in CondaDependencies._VALID_YML_KEYS:
            msg = "Environment error: unknown {} key in environment specification".format(str(key))
            raise UserErrorException(msg)

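# Example (hedged sketch): the shape of a specification that passes the
# validator above. The accepted key set lives in CondaDependencies._VALID_YML_KEYS;
# name/channels/dependencies are assumed to be members here, and _validate_yaml
# is assumed to be reachable as a static/class-level helper.
import ruamel.yaml

yaml_text = """
name: project_environment
channels:
  - conda-forge
dependencies:
  - python=3.8
  - pip:
    - azureml-defaults
"""
env_spec = ruamel.yaml.round_trip_load(yaml_text)  # a dict-like CommentedMap
CondaDependencies._validate_yaml(env_spec)  # raises UserErrorException on unknown top-level keys
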
def get_partition_key_values(self, partition_keys=None):
    """Return the unique key values for the given partition keys.

    Validates that partition_keys is a valid subset of this dataset's full set of
    partition keys, then returns the unique values for those keys. If partition_keys
    is None, the unique key combinations for the full set of partition keys of this
    dataset are returned.

    .. code-block:: python

        # get all partition key value pairs
        partitions = ds.get_partition_key_values()
        # Return [{'country': 'US', 'state': 'WA', 'partition_date': datetime('2020-1-1')}]

        partitions = ds.get_partition_key_values(['country'])
        # Return [{'country': 'US'}]

    :param partition_keys: The partition keys to return unique values for.
    :type partition_keys: builtin.list[str]
    """
    import time
    starting_time = time.process_time()
    if not self.partition_keys:
        raise UserErrorException(
            "get_partition_key_values is not available to a dataset that has no "
            "partition keys")
    if not partition_keys:
        partition_keys = self.partition_keys
    invalid_keys = [key for key in partition_keys if key not in self.partition_keys]
    if len(invalid_keys) != 0:
        raise UserErrorException(
            "{0} are invalid partition keys".format(invalid_keys))

    dataflow = self._dataflow.keep_columns(partition_keys)
    parse_step_types = ('Microsoft.DPrep.ReadParquetFileBlock',
                        'Microsoft.DPrep.ParseDelimitedBlock',
                        'Microsoft.DPrep.ParseJsonLinesBlock')
    # Iterate over a copy: removing items from the list being iterated skips elements.
    for step in list(dataflow._steps):
        if step.step_type in parse_step_types:
            dataflow._steps.remove(step)
    dataflow = dataflow.distinct_rows()
    df = dataflow.to_pandas_dataframe()
    partition_key_values = df[partition_keys].to_dict(orient='records') if df.shape[0] != 0 else []

    if self._registration and self._registration.workspace:
        collect_datasets_usage(
            _get_logger(), _PATITION_KEY_VALUES_ACTIVITY, [self],
            self._registration.workspace, "{}", {
                "execution_time": time.process_time() - starting_time,
                "number_of_partition_keys": len(partition_keys)
            })
    return partition_key_values

def _validate_config(self, data_reference, key):
    from azureml.exceptions import UserErrorException
    if not data_reference.data_store_name:
        raise UserErrorException(
            "DataReference {} is missing the datastore name".format(key))
    if self._is_upload(data_reference) and not data_reference.path_on_compute:
        raise UserErrorException(
            "DataReference {} is missing the relative path on the compute".format(key))

def _text_input(prompt_message, allow_empty=False):
    text_1 = input(prompt_message)
    if len(text_1) <= 0 and not allow_empty:
        raise UserErrorException("Empty value not allowed. Please try again.")
    text_2 = input("Re-enter the value for confirmation:")
    if text_1 == text_2:
        return text_1
    else:
        raise UserErrorException("Entered values don't match. Please try again.")

def _password_input(prompt_message, allow_empty=False):
    password_1 = getpass.getpass(prompt_message)
    if len(password_1) <= 0 and not allow_empty:
        raise UserErrorException("Empty password not allowed. Please try again.")
    password_2 = getpass.getpass("Re-enter the password for confirmation:")
    if password_1 == password_2:
        return password_1
    else:
        raise UserErrorException("Entered passwords don't match. Please try again.")

def submit_pipeline(
        workspace=None,  # Auto populated args + object
        pipeline_id=None,
        experiment_name=None,
        pipeline_yaml=None,
        pipeline_params=None,
        datapath_params=None,
        output_file=None,
        # We enforce a logger
        logger=None):
    """Submit a pipeline run based on a published pipeline ID."""
    if pipeline_id is None and pipeline_yaml is None:
        raise UserErrorException("Please specify a pipeline ID or a pipeline YAML file")

    published_pipeline = None
    pipeline = None

    if pipeline_id is not None:
        from azureml.pipeline.core import PublishedPipeline
        published_pipeline = PublishedPipeline.get(workspace, pipeline_id)
        if experiment_name is None or experiment_name == '':
            # Use the pipeline name as the experiment name
            experiment_name = published_pipeline._sanitize_name()
    else:
        from azureml.pipeline.core import Pipeline
        pipeline = Pipeline.load_yaml(workspace, pipeline_yaml)
        if experiment_name is None:
            raise UserErrorException("Please specify an experiment name")

    assigned_params = _parse_key_values(pipeline_params, 'Parameter assignment')

    datapaths = _parse_key_values(datapath_params, 'Datapath assignment')
    for datapath_param_name in datapaths:
        datastore_with_path = datapaths[datapath_param_name]
        if '/' not in datastore_with_path:
            raise UserErrorException("Datapath value %s should have format datastore/path"
                                     % datastore_with_path)
        path_tokens = datastore_with_path.split('/', 1)
        from azureml.core import Datastore
        from azureml.data.datapath import DataPath
        datastore = Datastore(workspace, path_tokens[0])
        assigned_params[datapath_param_name] = DataPath(datastore=datastore,
                                                        path_on_datastore=path_tokens[1])

    dict_output = _pipeline_run_submit(experiment_name, assigned_params, published_pipeline,
                                       pipeline, workspace, output_file, logger)

    return dict_output

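# Example (hedged sketch; the workspace, pipeline ID, and datastore paths are
# hypothetical). Parameter and datapath assignments are assumed to use the
# "name=value" form consumed by _parse_key_values; datapath values must contain
# a '/', with everything after the first '/' treated as the path on the datastore.
import logging
from azureml.core import Workspace

ws = Workspace.from_config()  # assumes a config.json in the working directory
result = submit_pipeline(
    workspace=ws,
    pipeline_id="aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee",
    experiment_name="pipeline-smoke-test",
    pipeline_params=["learning_rate=0.01"],
    datapath_params=["input_data=workspaceblobstore/raw/2020/01"],
    logger=logging.getLogger(__name__),
)
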
def _validate_inputs(dataset, compute_target):
    if not isinstance(dataset, TabularDataset):
        raise UserErrorException(
            'Invalid type. dataset should be of type '
            'azureml.data.tabular_dataset.TabularDataset but was found to be '
            'of type {0}.'.format(type(dataset)))
    if not isinstance(compute_target, (ComputeTarget, str)):
        raise UserErrorException(
            'Invalid type. compute_target should be either of type ComputeTarget or '
            'string but was found to be of type {0}.'.format(type(compute_target)))

def convert_seconds_to_duration(duration_in_seconds):
    """Convert a duration in seconds into an ISO-8601 formatted duration string."""
    try:
        duration_in_seconds = int(duration_in_seconds)
    except (TypeError, ValueError):
        raise UserErrorException(
            'Invalid input, provide an integer duration in seconds')
    if duration_in_seconds < 0:
        raise UserErrorException('Invalid input, provide a non-negative duration in seconds')
    return "PT{}S".format(duration_in_seconds)

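# Example: the ISO-8601 duration encoding produced above is simply PT<seconds>S.
print(convert_seconds_to_duration(90))      # "PT90S"
print(convert_seconds_to_duration("3600"))  # "PT3600S" (numeric strings are coerced via int())
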
def add_pip_package(self, pip_package):
    r"""Add a pip package.

    .. note::
        Adding a dependency of an already referenced package will remove the previous reference and add a \
        new reference to the end of the dependencies list. This may change the order of the dependencies.

    :param pip_package: The pip package to be added.
    :type pip_package: str
    """
    if self._is_option(pip_package):
        raise UserErrorException("Invalid package name {}".format(pip_package))

    self.remove_pip_package(pip_package)
    if not self._has_pip_package():
        pip_obj = {PIP: [pip_package]}
        if PACKAGES in self._conda_dependencies:
            self._conda_dependencies[PACKAGES].append(pip_obj)
        else:
            self._conda_dependencies[PACKAGES] = [pip_obj]
    elif pip_package not in self.pip_packages:
        for pitem in self._conda_dependencies[PACKAGES]:
            if isinstance(pitem, dict) and PIP in pitem:
                pitem[PIP].append(pip_package)

def set_pip_option(self, pip_option):
    """Add or update a pip option.

    :param pip_option: The pip option to add.
    :type pip_option: str
    """
    if not self._is_option(pip_option):
        raise UserErrorException("Invalid pip option {}".format(pip_option))

    if not self._has_pip_package():
        pip_obj = {PIP: [pip_option]}
        if PACKAGES in self._conda_dependencies:
            self._conda_dependencies[PACKAGES].append(pip_obj)
        else:
            self._conda_dependencies[PACKAGES] = [pip_obj]
    else:
        options = [x.split()[0] for x in self.pip_options]
        option_to_add = pip_option.split()[0]
        for pitem in self._conda_dependencies[PACKAGES]:
            if isinstance(pitem, dict) and PIP in pitem:
                if option_to_add not in options:
                    pitem[PIP].append(pip_option)
                else:
                    # Replace the existing value for this option flag in place.
                    for i in range(len(pitem[PIP])):
                        if pitem[PIP][i].split()[0] == option_to_add:
                            pitem[PIP][i] = pip_option

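# Example (hedged sketch; the package name and index URLs are hypothetical):
# add_pip_package appends a package under the pip section, while set_pip_option
# adds an option or, if one with the same flag exists, replaces it in place.
from azureml.core.conda_dependencies import CondaDependencies

deps = CondaDependencies()
deps.add_pip_package("requests==2.25.1")
deps.set_pip_option("--extra-index-url https://example.org/simple")
# Same flag again: the earlier value is replaced rather than appended.
deps.set_pip_option("--extra-index-url https://mirror.example.org/simple")
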
def upload_dir(self, dir_path, origin, container, path_to_name_fn=None, datastore_name=None):
    """Upload all files in path.

    :rtype: list[BatchArtifactContentInformationDto]
    """
    if not os.path.isdir(dir_path):
        raise UserErrorException(
            "Cannot upload path: {} since it is not a valid directory.".format(dir_path))

    paths_to_upload = []
    names = []
    for pathl, _subdirs, files in os.walk(dir_path):
        for _file in files:
            fpath = os.path.join(pathl, _file)
            paths_to_upload.append(fpath)
            if path_to_name_fn is not None:
                name = path_to_name_fn(fpath)
            else:
                name = fpath
            names.append(name)
    self._logger.debug("Uploading {}".format(names))
    result = self.upload_files(paths_to_upload, origin, container, names,
                               datastore_name=datastore_name)
    return result

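# Example (hedged sketch; `client`, the origin, and the container are
# hypothetical). Without path_to_name_fn the artifact name is the full local
# path; a relpath-based mapper keeps names relative to the uploaded directory.
import os

def relative_name(fpath, root="./outputs"):
    # ./outputs/model/weights.bin -> model/weights.bin
    return os.path.relpath(fpath, root).replace(os.sep, "/")

client.upload_dir("./outputs", "ExperimentRun", "my-container",
                  path_to_name_fn=relative_name)
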
def reactivate_experiment(self, experiment_id, new_name=None, caller=None,
                          custom_headers=None, is_async=False):
    """Reactivate an archived experiment.

    :param experiment_id: experiment id (required)
    :type experiment_id: str
    :param new_name: new experiment name (optional)
    :type new_name: str
    :param is_async: execute request asynchronously
    :type is_async: bool
    :param caller: caller function name (optional)
    :type caller: optional[string]
    :param custom_headers: headers that will be added to the request (optional)
    :type custom_headers: optional[dict]
    :return: the return type is based on the is_async parameter. If is_async is True,
        the request is called asynchronously.
    :rtype: ~_restclient.models.ExperimentDto (is_async is False) or
        azureml._async.AsyncTask (is_async is True)
    """
    if new_name is not None:
        raise UserErrorException(
            "Cannot rename an experiment. If the archived experiment name conflicts"
            " with an active experiment name, you can delete the active experiment"
            " before unarchiving this experiment.")
    modify_experiment_dto = ModifyExperimentDto(archive=False)
    return self.update_experiment(experiment_id, modify_experiment_dto, caller,
                                  custom_headers, is_async)

def get_workspace_or_default_name(workspace_name, throw_error=False, subscription_id=None,
                                  auth=None, project_path=None):
    """Return the provided workspace name or a default.

    The lookup order is: 1) the specified workspace_name parameter, 2) the project
    context, 3) the "az configure" defaults.

    :param auth:
    :type auth: azureml.core.authentication.AbstractAuthentication
    :param workspace_name:
    :type workspace_name: str
    :param throw_error: throw_error=True throws an error if the eventual workspace_name is None
    :type throw_error: bool
    :return: Returns the provided or default value of the workspace name.
    """
    if workspace_name:
        return workspace_name
    project_object = _get_project_object(subscription_id=subscription_id, auth=auth,
                                         project_path=project_path)
    if project_object:
        return project_object.workspace.name

    if throw_error:
        raise UserErrorException(
            'Error, default workspace not set and workspace name parameter not provided.'
            '\nPlease set a default workspace using "az ml folder attach -w myworkspace -g '
            'myresourcegroup" or provide a value for the workspace name parameter.')
    else:
        return workspace_name

def _handle_http_operation_error(self, operation_error, origin, container, path, prefix=False):
    """Handle an HttpOperationError received from the Artifact Service.

    :param operation_error: the error received
    :type operation_error: HttpOperationError
    :param origin: origin component of the artifactId
    :type origin: str
    :param container: container component of the artifactId
    :type container: str
    :param path: path component of the artifactId
    :type path: str
    :param prefix: True if the path represents a directory, False if a single file
    :type prefix: bool
    """
    if operation_error.response.status_code == 404:
        existing_files = self.get_file_paths(origin, container)
        type_string = "Prefix" if prefix else "File"
        raise UserErrorException("{0} with path {1} was not found,\n"
                                 "available files include: "
                                 "{2}.".format(type_string, path, ",".join(existing_files)))
    else:
        raise operation_error

def handle_error(error):
    # `name` and `version` are captured from the enclosing scope; the exception
    # is returned for the caller to raise.
    if error.response.status_code == 404:
        return UserErrorException(
            'Cannot find dataset registered with name "{}"{} in the workspace.'.format(
                name,
                '' if version == 'latest' else ' (version: {})'.format(version)))

def remove_tags(self, tags=None):
    """Remove the specified keys from the tags dictionary of this dataset.

    :param tags: The list of keys to remove.
    :type tags: builtin.list[str]
    :return: The updated dataset object.
    :rtype: typing.Union[azureml.data.TabularDataset, azureml.data.FileDataset]
    """
    if not self._registration or not self._registration.workspace or not self._registration.registered_id:
        raise UserErrorException(
            'To remove tags from this dataset it must be registered.')
    workspace = self._registration.workspace

    def request():
        updated_tags = deepcopy(self._registration.tags)
        for item in set(tags or []).intersection(updated_tags):
            del updated_tags[item]
        return _restclient(workspace).dataset.update_dataset(
            workspace.subscription_id,
            workspace.resource_group,
            workspace.name,
            dataset_id=self._registration.registered_id,
            new_dataset_dto=_dataset_to_dto(
                self, self.name, self.description, updated_tags,
                self._registration.registered_id),
            custom_headers=self._get_telemetry_headers())

    success, result = _make_request(request)
    if not success:
        raise result

    result_dto = _dto_to_dataset(workspace, result)
    self._registration.tags = result_dto.tags
    return result_dto

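# Example (hedged sketch; the dataset name and tag keys are hypothetical):
# remove_tags only works on a dataset that is registered in a workspace.
from azureml.core import Workspace, Dataset

ws = Workspace.from_config()
ds = Dataset.get_by_name(ws, name="sales-data")
ds = ds.remove_tags(["stage", "owner"])
print(ds.tags)
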
def _validate_mode(self, mode):
    from azureml.core.runconfig import SUPPORTED_DATAREF_MODES
    from azureml.exceptions import UserErrorException
    message = "Invalid mode {0}. Only mount, download, and upload are supported."
    if mode not in SUPPORTED_DATAREF_MODES:
        raise UserErrorException(message.format(mode))

def upload_dir(self, dir_path, path_to_name_fn=None, skip_first_level=False):
    """Upload all files in path.

    :rtype: list[BatchArtifactContentInformationDto]
    """
    if self._run_id is None:
        raise UserErrorException("Cannot upload when run_id is None")

    paths_to_upload = []
    names = []
    for pathl, _subdirs, files in os.walk(dir_path):
        for _file in files:
            fpath = os.path.join(pathl, _file)
            paths_to_upload.append(fpath)
            if path_to_name_fn is not None:
                name = path_to_name_fn(fpath)
            elif skip_first_level:
                # Drop the first path segment; note the "/" split assumes a
                # POSIX-style path and raises IndexError when there is none.
                sub_dir = pathl.split("/", 1)[1]
                name = os.path.join(sub_dir, _file)
            else:
                name = fpath
            names.append(name)
    self._logger.debug("Uploading {}".format(names))
    result = self.upload_files(paths_to_upload, names)
    return result

def attach_folder_to_workspace_and_experiment(
        workspace=None,
        experiment_name=None,
        path=None,
        # We should enforce a logger
        logger=None):
    path = os.path.abspath(path)
    if os.path.exists(path) and not os.path.isdir(path):
        raise UserErrorException("The provided path [{}] must be a directory".format(path))
    elif not os.path.exists(path):
        logger.info("Creating non-existent path %s", path)
        os.makedirs(path, exist_ok=True)

    logger.debug("Workspace to attach is %s", workspace._workspace_id)

    if experiment_name is None:
        path = path.rstrip('\\/')
        experiment_to_attach = os.path.basename(path)
        logger.debug("No experiment name was provided")
    else:
        experiment_to_attach = experiment_name

    logger.debug("Attaching folder %s to experiment %s", path, experiment_to_attach)
    project = workspace._initialize_folder(experiment_to_attach, directory=path)
    return project._serialize_to_dict()

def _ensure_workspace(self, workspace):
    if workspace is not None:
        return workspace
    if self._registration is None or self._registration.workspace is None:
        raise UserErrorException(
            'The dataset does not belong to a workspace. Please pass in the workspace '
            'as an argument.')
    return self._registration.workspace

def wrapped(self, *args, **kwargs):
    if self._id is None:
        raise UserErrorException(
            "{} doesn't have an id set; therefore, the {} cannot "
            "modify the experiment. Please call the Experiment "
            "constructor by setting _create_in_cloud to True".format(
                self, self.__class__.__name__))
    return func(self, *args, **kwargs)

def _check_paramiko():
    try:
        import paramiko
        return paramiko.AuthenticationException
    except ImportError:
        raise UserErrorException(
            "Please install paramiko to use deprecated legacy compute target methods.")

def wait_for_completion(self, show_output=False):
    """Wait for the model evaluation process to finish.

    :param show_output: Boolean option to print more verbose output. Defaults to False.
    :type show_output: bool
    """
    if not (self.workspace and self.create_operation_id):
        raise UserErrorException('wait_for_completion operation cannot be performed on this object. '
                                 'Make sure the object was created via the appropriate method '
                                 'in the Model class')
    operation_state, error, request_id = self._get_operation_state()
    self.parent_request_id = request_id
    current_state = operation_state
    if show_output:
        sys.stdout.write('{}'.format(current_state))
        sys.stdout.flush()
    while operation_state not in ['Cancelled', 'Succeeded', 'Failed', 'TimedOut']:
        time.sleep(5)
        operation_state, error, _ = self._get_operation_state()
        if show_output:
            sys.stdout.write('.')
            if operation_state != current_state:
                sys.stdout.write('\n{}'.format(operation_state))
                current_state = operation_state
            sys.stdout.flush()
    if show_output:
        sys.stdout.write('\n')
        sys.stdout.flush()
    module_logger.info(
        'Model {} operation with name {} finished operation {}\n'.format(
            self.__class__._model_eval_type, self.name, operation_state))
    if operation_state == 'Failed':
        if error and 'statusCode' in error and 'message' in error:
            module_logger.info(
                'Model {} failed with\n'
                'StatusCode: {}\n'
                'Message: {}\n'
                'Operation ID: {}\n'
                'Request ID: {}\n'.format(
                    self.__class__._model_eval_type,
                    error['statusCode'],
                    error['message'],
                    self.create_operation_id,
                    self.parent_request_id))
        else:
            module_logger.info(
                'Model profiling failed, unexpected error response:\n'
                '{}\n'
                'Operation ID: {}\n'
                'Request ID: {}\n'.format(
                    error, self.create_operation_id, self.parent_request_id))
    self._update_creation_state()

def update_args_and_io(args, inputs, outputs):
    if isinstance(args, str):
        return
    for index in range(len(args)):
        if isinstance(args[index], _Dataset):
            raise UserErrorException(
                "Dataset cannot be used directly in a run. If you are using a FileDataset and "
                "would like to mount or download the dataset, please call the as_mount or the as_download "
                "methods on the dataset object. If you would like to use the direct mode, please call the "
                "as_named_input method on the dataset object to convert the dataset into a "
                "DatasetConsumptionConfig. Please visit our public documentation for more information on these "
                "methods and classes at https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data"
                ".output_dataset_config.outputfiledatasetconfig?view=azure-ml-py."
            )
        elif isinstance(args[index], Data):
            raise UserErrorException(
                "azureml.core.runconfig.Data is not supported in arguments. Only "
                "DatasetConsumptionConfig is supported. It can be created by calling "
                "dataset.as_named_input('my_dataset')")
        elif isinstance(args[index], DatasetConsumptionConfig):
            dataset = args[index]
            if dataset.name in inputs:
                module_logger.warning((
                    "Dataset with the name {} is already defined in the data section of the "
                    "RunConfiguration. The DatasetConsumptionConfig in the data section will "
                    "be used to materialize the data").format(dataset.name))
            else:
                inputs[dataset.name] = dataset
            args[index] = _DATASET_ARGUMENT_TEMPLATE.format(dataset.name)
        elif isinstance(args[index], OutputDatasetConfig):
            output = args[index]
            args[index] = _DATASET_OUTPUT_ARGUMENT_TEMPLATE.format(output.name)
            outputs[output.name] = output
        elif isinstance(args[index], OutputData):
            raise UserErrorException(
                "Arguments do not support OutputData. You need to pass the placeholder "
                "into arguments, which will be replaced with the output directory where "
                "your script should write the output to. The placeholder has the following "
                "format: {}:name, where name is the key of the OutputData in the "
                "output_data section of the run "
                "configuration.".format(_DATASET_OUTPUT_ARGUMENT_TEMPLATE))

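# Example (hedged sketch; the dataset name and training script are
# hypothetical): a raw Dataset is rejected by the checks above, so it is
# wrapped with as_named_input (and as_mount for a FileDataset) before being
# placed in the arguments list.
from azureml.core import Dataset, ScriptRunConfig, Workspace

ws = Workspace.from_config()
ds = Dataset.get_by_name(ws, name="images")  # a FileDataset
src = ScriptRunConfig(source_directory=".", script="train.py",
                      arguments=["--data", ds.as_named_input("images").as_mount()])
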
def _verify_prefix(prefix):
    if not prefix:
        return
    prefix = prefix.lstrip("./\\")
    prefix_segments = re.split(r'[/\\]+', prefix)
    if len(prefix_segments) > 1:
        raise UserErrorException(
            "Nested prefix '{}' for Azure File Share is currently not supported.".format(prefix))