示例#1
0
 def request():
     """Call ``unregister_dataset`` on the workspace REST client for ``self.name``.

     :return: The value returned by ``unregister_dataset``.
     """
     dataset_operations = _restclient(workspace).dataset
     return dataset_operations.unregister_dataset(
         workspace.subscription_id,
         workspace.resource_group,
         workspace.name,
         self.name,
         custom_headers=_custom_headers,
     )
示例#2
0
 def request():
     """Call ``ensure_saved`` on the workspace REST client with this dataset as a saved-dataset DTO.

     :return: The value returned by ``ensure_saved``.
     """
     ensure_saved = _restclient(workspace).dataset.ensure_saved
     return ensure_saved(
         subscription_id=workspace.subscription_id,
         resource_group_name=workspace.resource_group,
         workspace_name=workspace.name,
         dataset=_dataset_to_saved_dataset_dto(self),
         custom_headers=self._get_telemetry_headers(),
     )
示例#3
0
    def is_stale(self):
        """Tell whether the underlying data changed after this profile was computed.

        The answer comes from the ``is_up_to_date`` field of the action result returned by the service
        for this profile's saved dataset:
        - if ``is_up_to_date`` is None, AzureMLException is raised with the service-provided error;
        - if ``is_up_to_date`` is False, the profile is stale and True is returned;
        - otherwise the profile matches current data and False is returned.

        :return: boolean to describe whether the computed profile is stale or not.
        :rtype: bool
        :raises AzureMLException: if the service reports no ``is_up_to_date`` value.
        """
        from azureml.core import Dataset
        profiled_dataset = Dataset.get_by_id(self._workspace, id=self._saved_dataset_id)
        workspace = profiled_dataset._ensure_workspace(self._workspace)

        profile_request = ActionRequestDto(
            action_type=_ACTION_TYPE_PROFILE,
            saved_dataset_id=profiled_dataset._ensure_saved(workspace),
            arguments={
                'generate_preview': 'True',
                'row_count': '1000',
            },
        )

        dataset_operations = _restclient(workspace).dataset
        action_result = dataset_operations.get_action_result(
            workspace.subscription_id,
            workspace.resource_group,
            workspace.name,
            dataset_id=_LEGACY_DATASET_ID,
            request=profile_request,
            custom_headers=_custom_headers,
        )

        up_to_date = action_result.is_up_to_date
        if up_to_date is None:
            raise AzureMLException(action_result.is_up_to_date_error)

        return not up_to_date
示例#4
0
 def request_for_unregistered():
     """Call ``get_by_id`` on the workspace REST client for ``id`` with legacy-id resolution enabled.

     :return: The value returned by ``get_by_id``.
     """
     get_by_id = _restclient(workspace).dataset.get_by_id
     return get_by_id(
         subscription_id=workspace.subscription_id,
         resource_group_name=workspace.resource_group,
         workspace_name=workspace.name,
         id=id,
         resolve_legacy_id=True,
         custom_headers=_custom_headers,
     )
示例#5
0
 def request():
     """Call ``register`` on the workspace REST client with this dataset converted to a DTO.

     ``create_new_version`` is forwarded as both ``if_exists_ok`` and ``update_definition_if_exists``.

     :return: The value returned by ``register``.
     """
     dataset_operations = _restclient(workspace).dataset
     return dataset_operations.register(
         workspace.subscription_id,
         workspace.resource_group,
         workspace.name,
         dataset_dto=_dataset_to_dto(self, name, description, tags),
         if_exists_ok=create_new_version,
         update_definition_if_exists=create_new_version,
         custom_headers=self._get_telemetry_headers(),
     )
示例#6
0
        def request():
            """Call ``get_dataset_by_name`` on the workspace REST client for ``name`` and ``version``.

            :return: The DTO returned by ``get_dataset_by_name``.
            """
            dataset_operations = _restclient(workspace).dataset
            return dataset_operations.get_dataset_by_name(
                workspace.subscription_id,
                workspace.resource_group,
                workspace.name,
                dataset_name=name,
                version_id=version,
                custom_headers=_custom_headers,
            )
示例#7
0
 def list_dataset(continuation_token):
     """Request one page of datasets from the workspace REST client.

     Each request asks for 100 entries, includes latest definitions and excludes invisible datasets.

     :param continuation_token: Token forwarded to the ``list`` call as ``continuation_token``.
     :return: The value returned by ``list``.
     """
     list_datasets = _restclient(workspace).dataset.list
     query = {
         'subscription_id': workspace.subscription_id,
         'resource_group_name': workspace.resource_group,
         'workspace_name': workspace.name,
         'page_size': 100,
         'include_latest_definition': True,
         'include_invisible': False,
         'continuation_token': continuation_token,
         'custom_headers': _custom_headers,
     }
     return list_datasets(**query)
示例#8
0
 def request_for_registered():
     """Call ``get_datasets_by_saved_dataset_id`` on the workspace REST client for ``id``.

     :return: The value returned by ``get_datasets_by_saved_dataset_id``.
     """
     dataset_operations = _restclient(workspace).dataset
     return dataset_operations.get_datasets_by_saved_dataset_id(
         subscription_id=workspace.subscription_id,
         resource_group_name=workspace.resource_group,
         workspace_name=workspace.name,
         saved_dataset_id=id,
         # Only the first match is needed; more than one can exist only for datasets created long ago.
         page_size=1,
         custom_headers=_custom_headers,
     )
示例#9
0
    def get_profile(self, workspace=None):
        """Fetch the data profile of the latest profile run submitted for this or the same dataset.

        :param workspace: The workspace where profile run was submitted. Defaults to the workspace of this dataset.
            Required if dataset is not associated to a workspace.
            See https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.workspace.workspace
            for more information on workspaces.
        :type workspace: azureml.core.Workspace
        :return: Profile result from the latest profile run of type DatasetProfile.
        :rtype: azureml.data.dataset_profile.DatasetProfile
        :raises AzureMLException: if the action result lists no result artifacts, or if building the
            profile from the downloaded artifact content fails.
        """
        workspace = self._ensure_workspace(workspace)
        saved_dataset_id = self._ensure_saved(workspace)

        # The arguments {'generate_preview': 'True', 'row_count': '1000'} are sent so that the requestHash
        # stays the same: the GenerateProfileWithPreview API adds these arguments on the service side.
        # If that ever changes there, it has to change here too.
        from azureml._restclient.models import ActionRequestDto
        profile_request = ActionRequestDto(
            action_type=_ACTION_TYPE_PROFILE,
            saved_dataset_id=saved_dataset_id,
            arguments={'generate_preview': 'True', 'row_count': '1000'},
        )

        dataset_operations = _restclient(workspace).dataset
        action_result = dataset_operations.get_action_result(
            workspace.subscription_id,
            workspace.resource_group,
            workspace.name,
            dataset_id=_LEGACY_DATASET_ID,
            request=profile_request,
            custom_headers=_custom_headers,
        )

        artifact_ids = action_result.result_artifact_ids
        if not artifact_ids:
            raise AzureMLException('Unable to fetch profile results. Please submit a new profile run.')

        first_artifact = artifact_ids[0]
        from azureml._restclient.artifacts_client import ArtifactsClient
        artifacts_client = ArtifactsClient(workspace.service_context)
        # The artifact id is split on "/" into at most three parts, passed on as positional arguments.
        content = artifacts_client.download_artifact_contents_to_string(*first_artifact.split("/", 2))

        try:
            from azureml.data.dataset_profile import DatasetProfile
            return DatasetProfile(
                saved_dataset_id,
                action_result.run_id,
                action_result.experiment_name,
                workspace,
                dataprep().DataProfile._from_json(content),
            )
        except Exception:
            errormsg = 'Unable to fetch profile since profile result is corrupted. Please submit a new profile run.'
            _get_logger().error(errormsg)
            raise AzureMLException(errormsg)
示例#10
0
def _submit_profile(dataset_profile_config_object, workspace, experiment_name):
    """Start Profile execution with the given config on the given workspace.

    The profile action is always requested from the service. When the configured compute target equals
    ``_LOCAL_COMPUTE``, the bundled ``_profile_run_script.py`` is additionally submitted as a local script
    run under ``experiment_name``; otherwise the experiment name and run id reported by the service are used.

    :param dataset_profile_config_object: Config object providing ``_dataset`` and ``_compute_target``.
    :param workspace: Workspace in which the profile run is submitted.
    :param experiment_name: Experiment name sent with the profile request and used for local runs.
    :return: A DatasetProfileRun wrapping the workspace, the dataset and the run.
    """
    dataset = dataset_profile_config_object._dataset
    configured_target = dataset_profile_config_object._compute_target

    # The service call receives the name of a ComputeTarget object; any other value is passed as is.
    target_name = configured_target.name if isinstance(configured_target, ComputeTarget) else configured_target

    run_id = 'dataset_{}'.format(uuid.uuid4())
    saved_dataset_id = dataset._ensure_saved(workspace)
    dataset_operations = _restclient(workspace).dataset
    action_dto = dataset_operations.generate_profile_with_preview(
        workspace.subscription_id,
        workspace.resource_group,
        workspace.name,
        id=saved_dataset_id,
        compute_target=target_name,
        experiment_name=experiment_name,
        run_id=run_id,
        custom_headers=_custom_headers,
    )

    if configured_target == _LOCAL_COMPUTE:
        with tempfile.TemporaryDirectory() as temp_dir:
            # Copy the bundled script into a scratch directory that serves as the run's source directory.
            script = os.path.join(temp_dir, 'profile_run_script.py')
            bundled_script = os.path.join(os.path.dirname(__file__), '_profile_run_script.py')
            copyfile(bundled_script, script)

            # Run with the current interpreter and user-managed dependencies.
            run_local = RunConfiguration()
            run_local.environment.python.user_managed_dependencies = True
            run_local.environment.python.interpreter_path = sys.executable

            script_arguments = [action_dto.dataset_id, action_dto.action_id, saved_dataset_id]
            script_config = ScriptRunConfig(
                source_directory=temp_dir,
                script="profile_run_script.py",
                arguments=script_arguments,
                run_config=run_local,
            )
            experiment = Experiment(workspace, experiment_name)
            experiment.submit(script_config, run_id=run_id)
    else:
        experiment = Experiment(workspace, action_dto.experiment_name)
        run_id = action_dto.run_id

    return DatasetProfileRun(workspace, dataset, get_run(experiment, run_id))
示例#11
0
        def request():
            """Send an ``update_dataset`` call whose tags are the registered tags minus the keys in ``tags``.

            :return: The value returned by ``update_dataset``.
            """
            # Work on a copy so the registration's own tags are not modified here.
            remaining_tags = deepcopy(self._registration.tags)
            keys_to_drop = set(tags).intersection(remaining_tags)
            for key in keys_to_drop:
                del remaining_tags[key]

            dataset_operations = _restclient(workspace).dataset
            return dataset_operations.update_dataset(
                workspace.subscription_id,
                workspace.resource_group,
                workspace.name,
                dataset_id=self._registration.registered_id,
                new_dataset_dto=_dataset_to_dto(
                    self,
                    self.name,
                    self.description,
                    remaining_tags,
                    self._registration.registered_id,
                ),
                custom_headers=self._get_telemetry_headers(),
            )
示例#12
0
        def request():
            """Send an ``update_dataset`` call with the new description and tags.

            A ``description`` or ``tags`` of None falls back to the value stored on the registration.

            :return: The value returned by ``update_dataset``.
            """
            new_description = self._registration.description if description is None else description
            new_tags = self._registration.tags if tags is None else tags

            dataset_operations = _restclient(workspace).dataset
            return dataset_operations.update_dataset(
                workspace.subscription_id,
                workspace.resource_group,
                workspace.name,
                dataset_id=self._registration.registered_id,
                new_dataset_dto=_dataset_to_dto(
                    self,
                    self.name,
                    new_description,
                    new_tags,
                    self._registration.registered_id,
                ),
                custom_headers=self._get_telemetry_headers(),
            )
示例#13
0
    def profile(self):
        """Retrieve the data profile from result of this run, meanwhile checking if it matches current data.

        If the run status is not ``'Completed'``, ``(None, None)`` is returned without contacting the service.
        Otherwise the action for this run is fetched and handed to ``_profile_from_action``.

        :return: A tuple of values. The first value is the data profile result from the completed run. The second
            value is a flag indicating whether the profile matches current data:
                - if the data source change cannot be detected, the flag will be None;
                - if the data source was changed after submitting the profile run, the flag will be False;
                - otherwise, the profile matches current data, and the flag will be True.
        :rtype: (azureml.dataprep.DataProfile, bool)
        """
        if self.status != 'Completed':
            return (None, None)
        # The keyword is ``custom_headers``, as in the other REST client calls; the previous
        # ``_custom_headers`` spelling would not bind the headers parameter.
        action_dto = _restclient(self._workspace).dataset.get_action_by_id(
            self._workspace.subscription_id,
            self._workspace.resource_group,
            self._workspace.name,
            dataset_id=_LEGACY_DATASET_ID,
            action_id=self._action_id,
            custom_headers=_custom_headers)
        return _profile_from_action(self._workspace, action_dto)
示例#14
0
        def request():
            """Send an ``update_dataset`` call whose tags are the registered tags merged with ``tags``.

            :return: The value returned by ``update_dataset``.
            :raises UserErrorException: if a key in ``tags`` is already registered with a different value.
            """
            existing_tags = self._registration.tags
            conflicting_keys = [
                key for key in set(tags).intersection(existing_tags)
                if existing_tags[key] != tags[key]
            ]
            if conflicting_keys:
                raise UserErrorException(
                    ('Dataset already contains different values for tags '
                     'with the following keys {}').format(conflicting_keys))

            # Merge into a copy so the registration's own tags are not modified here.
            merged_tags = deepcopy(self._registration.tags)
            merged_tags.update(tags)

            dataset_operations = _restclient(workspace).dataset
            return dataset_operations.update_dataset(
                workspace.subscription_id,
                workspace.resource_group,
                workspace.name,
                dataset_id=self._registration.registered_id,
                new_dataset_dto=_dataset_to_dto(
                    self,
                    self.name,
                    self.description,
                    merged_tags,
                    self._registration.registered_id,
                ),
                custom_headers=self._get_telemetry_headers(),
            )
示例#15
0
    def get_profile_runs(self, workspace=None):
        """List the earlier profile runs associated with this or the same dataset in the workspace.

        :param workspace: The workspace where profile run was submitted. Defaults to the workspace of this dataset.
            Required if dataset is not associated to a workspace.
            See https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.workspace.workspace
            for more information on workspaces.
        :type workspace: azureml.core.Workspace
        :return: iterator object of type azureml.core.Run.
        :rtype: iter(azureml.core.Run)
        :raises AzureMLException: if the service returns no profile actions for the dataset.
        """
        workspace = self._ensure_workspace(workspace)
        from azureml._restclient.models import ActionRequestDto
        request_dto = ActionRequestDto(
            action_type=_ACTION_TYPE_PROFILE,
            saved_dataset_id=self._ensure_saved(workspace),
            arguments={'generate_preview': 'True', 'row_count': '1000'},
        )

        # Fetch pages until the service stops handing back a continuation token.
        actions = []
        next_token = None
        while True:
            page = _restclient(workspace).dataset.list_actions_from_request(
                workspace.subscription_id,
                workspace.resource_group,
                workspace.name,
                dataset_id=_LEGACY_DATASET_ID,
                request=request_dto,
                count=1000,
                custom_headers=_custom_headers,
                continuation_token=next_token,
            )
            actions.extend(page.value)
            next_token = page.continuation_token
            if next_token is None:
                break

        if not actions:
            raise AzureMLException('Unable to find any run information. Please submit a new profile run.')

        runs = []
        for action in actions:
            # Kept for backward compatibility: run_id used not to be persisted for local runs,
            # so older actions carry no run_id and are skipped.
            if action.run_id is None:
                continue
            from azureml.core import Experiment, get_run
            experiment = Experiment(workspace, action.experiment_name)
            try:
                run = get_run(experiment, action.run_id)
            except Exception:
                # Best effort: actions whose run cannot be retrieved are left out of the result.
                continue
            runs.append(run)

        return iter(runs)