def request():
    """Issue the service call that unregisters this dataset from the workspace."""
    client = _restclient(workspace)
    return client.dataset.unregister_dataset(
        workspace.subscription_id,
        workspace.resource_group,
        workspace.name,
        self.name,
        custom_headers=_custom_headers)
def request():
    """Persist this dataset as a saved dataset via the dataset service."""
    saved_dto = _dataset_to_saved_dataset_dto(self)
    client = _restclient(workspace)
    return client.dataset.ensure_saved(
        subscription_id=workspace.subscription_id,
        resource_group_name=workspace.resource_group,
        workspace_name=workspace.name,
        dataset=saved_dto,
        custom_headers=self._get_telemetry_headers())
def is_stale(self):
    """Return a boolean describing whether the computed profile is stale or not.

    A profile is considered to be stale if there is a change in the underlying
    data after the profile is computed.

    - if the data source change cannot be detected, TypeError is raised;
    - if the data source was changed after submitting the profile run, the flag will be True;
    - otherwise, the profile matches current data, and the flag will be False.

    :return: boolean to describe whether the computed profile is stale or not.
    :rtype: bool
    """
    from azureml.core import Dataset
    # Function-local import for consistency with get_profile/get_profile_runs,
    # which import the rest-client model the same way.
    from azureml._restclient.models import ActionRequestDto

    dataset = Dataset.get_by_id(self._workspace, id=self._saved_dataset_id)
    workspace = dataset._ensure_workspace(self._workspace)
    # The arguments must match those used when the profile was submitted so the
    # service resolves the request to the same action result.
    request_dto = ActionRequestDto(
        action_type=_ACTION_TYPE_PROFILE,
        saved_dataset_id=dataset._ensure_saved(workspace),
        arguments={'generate_preview': 'True', 'row_count': '1000'})
    action_result_dto = _restclient(workspace).dataset.get_action_result(
        workspace.subscription_id,
        workspace.resource_group,
        workspace.name,
        dataset_id=_LEGACY_DATASET_ID,
        request=request_dto,
        custom_headers=_custom_headers)
    # None means the service could not determine staleness; surface its error.
    if action_result_dto.is_up_to_date is None:
        raise AzureMLException(action_result_dto.is_up_to_date_error)
    return not action_result_dto.is_up_to_date
def request_for_unregistered():
    """Resolve a saved (unregistered) dataset directly by its id."""
    client = _restclient(workspace)
    return client.dataset.get_by_id(
        subscription_id=workspace.subscription_id,
        resource_group_name=workspace.resource_group,
        workspace_name=workspace.name,
        id=id,
        resolve_legacy_id=True,
        custom_headers=_custom_headers)
def request():
    """Register this dataset under the given name in the workspace."""
    dataset_dto = _dataset_to_dto(self, name, description, tags)
    client = _restclient(workspace)
    # create_new_version drives both flags: tolerate an existing registration
    # and update its definition in place.
    return client.dataset.register(
        workspace.subscription_id,
        workspace.resource_group,
        workspace.name,
        dataset_dto=dataset_dto,
        if_exists_ok=create_new_version,
        update_definition_if_exists=create_new_version,
        custom_headers=self._get_telemetry_headers())
def request():
    """Fetch the registered dataset DTO by name and optional version."""
    return _restclient(workspace).dataset.get_dataset_by_name(
        workspace.subscription_id,
        workspace.resource_group,
        workspace.name,
        dataset_name=name,
        version_id=version,
        custom_headers=_custom_headers)
def list_dataset(continuation_token):
    """Fetch one page (up to 100 entries) of the workspace's visible datasets."""
    client = _restclient(workspace)
    return client.dataset.list(
        subscription_id=workspace.subscription_id,
        resource_group_name=workspace.resource_group,
        workspace_name=workspace.name,
        page_size=100,
        include_latest_definition=True,
        include_invisible=False,
        continuation_token=continuation_token,
        custom_headers=_custom_headers)
def request_for_registered():
    """Look up registrations that point at the given saved dataset id."""
    client = _restclient(workspace)
    # just need the 1st (can only be more than one for dataset created in the old age)
    return client.dataset.get_datasets_by_saved_dataset_id(
        subscription_id=workspace.subscription_id,
        resource_group_name=workspace.resource_group,
        workspace_name=workspace.name,
        saved_dataset_id=id,
        page_size=1,
        custom_headers=_custom_headers)
def get_profile(self, workspace=None):
    """Get data profile from the latest profile run submitted for this or the same dataset in the workspace.

    :param workspace: The workspace where profile run was submitted. Defaults to the workspace of this dataset.
        Required if dataset is not associated to a workspace.
        See https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.workspace.workspace
        for more information on workspaces.
    :type workspace: azureml.core.Workspace
    :return: Profile result from the latest profile run of type DatasetProfile.
    :rtype: azureml.data.dataset_profile.DatasetProfile
    """
    workspace = self._ensure_workspace(workspace)
    saved_dataset_id = self._ensure_saved(workspace)
    # arguments [{'generate_preview': 'True', 'row_count': '1000'}] are added to ensure
    # that requestHash is same. The GenerateProfileWithPreview API add these arguments on service side.
    # If any changes are made there, this should also be changed.
    from azureml._restclient.models import ActionRequestDto
    request_dto = ActionRequestDto(
        action_type=_ACTION_TYPE_PROFILE,
        saved_dataset_id=saved_dataset_id,
        arguments={'generate_preview': 'True', 'row_count': '1000'})
    action_result_dto = _restclient(workspace).dataset.get_action_result(
        workspace.subscription_id,
        workspace.resource_group,
        workspace.name,
        dataset_id=_LEGACY_DATASET_ID,
        request=request_dto,
        custom_headers=_custom_headers)
    result_artifact_ids = action_result_dto.result_artifact_ids
    # No artifacts means no successful profile run exists for this dataset yet.
    if not result_artifact_ids:
        raise AzureMLException(
            'Unable to fetch profile results. Please submit a new profile run.'
        )
    result_artifact = result_artifact_ids[0]
    from azureml._restclient.artifacts_client import ArtifactsClient
    content = ArtifactsClient(
        workspace.service_context).download_artifact_contents_to_string(
            *result_artifact.split("/", 2))
    try:
        from azureml.data.dataset_profile import DatasetProfile
        profile = DatasetProfile(
            saved_dataset_id, action_result_dto.run_id,
            action_result_dto.experiment_name, workspace,
            dataprep().DataProfile._from_json(content))
    except Exception as e:
        errormsg = 'Unable to fetch profile since profile result is corrupted. Please submit a new profile run.'
        _get_logger().error(errormsg)
        # Chain the original exception so the corruption cause is not lost.
        raise AzureMLException(errormsg) from e
    return profile
def _submit_profile(dataset_profile_config_object, workspace, experiment_name):
    """Start profile execution with the given config on the given workspace.

    :param dataset_profile_config_object: Configuration carrying the dataset to
        profile and the compute target to profile it on.
    :param workspace: The workspace in which the profile run is submitted.
    :param experiment_name: Name of the experiment the run is filed under.
    :return: A DatasetProfileRun wrapping the submitted run.
    """
    dataset = dataset_profile_config_object._dataset
    compute_target = dataset_profile_config_object._compute_target
    # Normalize a ComputeTarget object to its name; strings pass through.
    # (Removed a no-op `else: compute_target = compute_target` branch.)
    if isinstance(compute_target, ComputeTarget):
        compute_target = compute_target.name
    run_id = 'dataset_' + str(uuid.uuid4())
    saved_dataset_id = dataset._ensure_saved(workspace)
    action_dto = _restclient(workspace).dataset.generate_profile_with_preview(
        workspace.subscription_id,
        workspace.resource_group,
        workspace.name,
        id=saved_dataset_id,
        compute_target=compute_target,
        experiment_name=experiment_name,
        run_id=run_id,
        custom_headers=_custom_headers)
    if dataset_profile_config_object._compute_target == _LOCAL_COMPUTE:
        # Local compute: execute the bundled profile script ourselves via a
        # ScriptRun that reuses the current interpreter.
        with tempfile.TemporaryDirectory() as temp_dir:
            script = os.path.join(temp_dir, 'profile_run_script.py')
            copyfile(
                os.path.join(os.path.dirname(__file__),
                             '_profile_run_script.py'), script)
            run_local = RunConfiguration()
            run_local.environment.python.user_managed_dependencies = True
            run_local.environment.python.interpreter_path = sys.executable
            script_config = ScriptRunConfig(
                source_directory=temp_dir,
                script="profile_run_script.py",
                arguments=[
                    action_dto.dataset_id, action_dto.action_id,
                    saved_dataset_id
                ],
                run_config=run_local)
            experiment = Experiment(workspace, experiment_name)
            experiment.submit(script_config, run_id=run_id)
    else:
        # Remote compute: the service already submitted the run; adopt the
        # experiment name and run id it reports.
        experiment = Experiment(workspace, action_dto.experiment_name)
        run_id = action_dto.run_id
    run = get_run(experiment, run_id)
    return DatasetProfileRun(workspace, dataset, run)
def request():
    """Strip the given tag keys from the registration and push the update."""
    remaining_tags = deepcopy(self._registration.tags)
    for key in set(tags).intersection(remaining_tags):
        del remaining_tags[key]
    dto = _dataset_to_dto(
        self, self.name, self.description, remaining_tags,
        self._registration.registered_id)
    return _restclient(workspace).dataset.update_dataset(
        workspace.subscription_id,
        workspace.resource_group,
        workspace.name,
        dataset_id=self._registration.registered_id,
        new_dataset_dto=dto,
        custom_headers=self._get_telemetry_headers())
def request():
    """Update the registration, keeping existing description/tags when None is passed."""
    updated_description = (self._registration.description
                           if description is None else description)
    updated_tags = self._registration.tags if tags is None else tags
    dto = _dataset_to_dto(
        self, self.name, updated_description, updated_tags,
        self._registration.registered_id)
    return _restclient(workspace).dataset.update_dataset(
        workspace.subscription_id,
        workspace.resource_group,
        workspace.name,
        dataset_id=self._registration.registered_id,
        new_dataset_dto=dto,
        custom_headers=self._get_telemetry_headers())
def profile(self):
    """Retrieve the data profile from result of this run, meanwhile checking if it matches current data.

    :return: A tuple of values. The first value is the data profile result from the completed run. The second
        value is a flag indicating whether the profile matches current data:

        - if the data source change cannot be detected, the flag will be None;
        - if the data source was changed after submitting the profile run, the flag will be False;
        - otherwise, the profile matches current data, and the flag will be True.
    :rtype: (azureml.dataprep.DataProfile, bool)
    """
    # Profile results only exist once the run has completed.
    if self.status != 'Completed':
        return (None, None)
    action_dto = _restclient(self._workspace).dataset.get_action_by_id(
        self._workspace.subscription_id,
        self._workspace.resource_group,
        self._workspace.name,
        dataset_id=_LEGACY_DATASET_ID,
        action_id=self._action_id,
        # Fix: was `_custom_headers=_custom_headers` — a mis-named keyword that
        # every other dataset-service call here spells `custom_headers=`, so
        # the headers were silently dropped by the generated client.
        custom_headers=_custom_headers)
    return _profile_from_action(self._workspace, action_dto)
def request():
    """Merge new tags into the registration, rejecting conflicting values for existing keys."""
    conflicting_keys = [
        key for key in set(tags).intersection(self._registration.tags)
        if self._registration.tags[key] != tags[key]
    ]
    if conflicting_keys:
        raise UserErrorException(
            ('Dataset already contains different values for tags '
             'with the following keys {}').format(conflicting_keys))
    merged_tags = deepcopy(self._registration.tags)
    merged_tags.update(tags)
    return _restclient(workspace).dataset.update_dataset(
        workspace.subscription_id,
        workspace.resource_group,
        workspace.name,
        dataset_id=self._registration.registered_id,
        new_dataset_dto=_dataset_to_dto(
            self, self.name, self.description, merged_tags,
            self._registration.registered_id),
        custom_headers=self._get_telemetry_headers())
def get_profile_runs(self, workspace=None):
    """Return previous profile runs associated with this or same dataset in the workspace.

    :param workspace: The workspace where profile run was submitted. Defaults to the workspace of this dataset.
        Required if dataset is not associated to a workspace.
        See https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.workspace.workspace
        for more information on workspaces.
    :type workspace: azureml.core.Workspace
    :return: iterator object of type azureml.core.Run.
    :rtype: iter(azureml.core.Run)
    """
    workspace = self._ensure_workspace(workspace)
    from azureml._restclient.models import ActionRequestDto
    # Hoisted out of the loops below: these imports are loop-invariant.
    from azureml.core import Experiment, get_run
    # Arguments must match what the submit path used so the request hash
    # resolves to the same set of actions.
    request_dto = ActionRequestDto(
        action_type=_ACTION_TYPE_PROFILE,
        saved_dataset_id=self._ensure_saved(workspace),
        arguments={'generate_preview': 'True', 'row_count': '1000'})
    action_dto_list = []
    continuation_token = None
    # Page through all matching actions until the service stops returning a
    # continuation token.
    while True:
        paginated_action_dto = _restclient(
            workspace).dataset.list_actions_from_request(
                workspace.subscription_id,
                workspace.resource_group,
                workspace.name,
                dataset_id=_LEGACY_DATASET_ID,
                request=request_dto,
                count=1000,
                custom_headers=_custom_headers,
                continuation_token=continuation_token)
        action_dto_list.extend(paginated_action_dto.value)
        continuation_token = paginated_action_dto.continuation_token
        if continuation_token is None:
            break
    if not action_dto_list:
        raise AzureMLException(
            'Unable to find any run information. Please submit a new profile run.'
        )
    run_list = []
    for item in action_dto_list:
        # This is done to ensure backward compatibility. Earlier we do not persist
        # run_id for local runs. Hence for older runs run_id is empty.
        if item.run_id is None:
            continue
        experiment = Experiment(workspace, item.experiment_name)
        try:
            run = get_run(experiment, item.run_id)
        except Exception:
            # Skip runs that can no longer be resolved in the workspace.
            continue
        run_list.append(run)
    return iter(run_list)