Exemplo n.º 1
0
    def get_profile(self, workspace=None):
        """Fetch the profile computed by the latest profile run for this or the same dataset in the workspace.

        :param workspace: The workspace where profile run was submitted. Defaults to the workspace of this dataset.
            Required if dataset is not associated to a workspace.
            See https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.workspace.workspace
            for more information on workspaces.
        :type workspace: azureml.core.Workspace
        :return: Profile result from the latest profile run of type DatasetProfile.
        :rtype: azureml.data.dataset_profile.DatasetProfile
        :raises AzureMLException: If the action result lists no result artifacts, or if building the
            DatasetProfile from the downloaded artifact content fails.
        """
        workspace = self._ensure_workspace(workspace)
        dataset_id = self._ensure_saved(workspace)

        # The request must carry exactly these arguments so that requestHash matches: the
        # GenerateProfileWithPreview API adds them on the service side. Keep both places in sync.
        from azureml._restclient.models import ActionRequestDto
        profile_arguments = {'generate_preview': 'True', 'row_count': '1000'}
        profile_request = ActionRequestDto(action_type=_ACTION_TYPE_PROFILE,
                                           saved_dataset_id=dataset_id,
                                           arguments=profile_arguments)

        action_result = _restclient(workspace).dataset.get_action_result(
            workspace.subscription_id,
            workspace.resource_group,
            workspace.name,
            dataset_id=_LEGACY_DATASET_ID,
            request=profile_request,
            custom_headers=_custom_headers)

        artifact_ids = action_result.result_artifact_ids
        has_artifacts = artifact_ids is not None and len(artifact_ids) > 0
        if not has_artifacts:
            raise AzureMLException(
                'Unable to fetch profile results. Please submit a new profile run.'
            )

        # Only the first artifact is read; its id is split on "/" into at most three parts,
        # which are passed positionally to the download call.
        first_artifact = artifact_ids[0]
        from azureml._restclient.artifacts_client import ArtifactsClient
        artifacts_client = ArtifactsClient(workspace.service_context)
        raw_profile = artifacts_client.download_artifact_contents_to_string(
            *first_artifact.split("/", 2))

        try:
            from azureml.data.dataset_profile import DatasetProfile
            parsed_profile = dataprep().DataProfile._from_json(raw_profile)
            return DatasetProfile(dataset_id,
                                  action_result.run_id,
                                  action_result.experiment_name,
                                  workspace,
                                  parsed_profile)
        except Exception:
            errormsg = 'Unable to fetch profile since profile result is corrupted. Please submit a new profile run.'
            _get_logger().error(errormsg)
            raise AzureMLException(errormsg)
Exemplo n.º 2
0
    def is_stale(self):
        """Tell whether the underlying data changed after this profile was computed.

        The latest profile action result for the saved dataset is requested from the service:
        - if the result carries no up-to-date information, AzureMLException is raised with the
          error reported in the result;
        - if the result is not up to date, True is returned;
        - otherwise the profile matches the current data and False is returned.

        :return: boolean to describe whether the computed profile is stale or not.
        :rtype: bool
        :raises AzureMLException: If ``is_up_to_date`` of the action result is None.
        """
        from azureml.core import Dataset
        saved_dataset = Dataset.get_by_id(self._workspace, id=self._saved_dataset_id)
        target_workspace = saved_dataset._ensure_workspace(self._workspace)

        staleness_request = ActionRequestDto(
            action_type=_ACTION_TYPE_PROFILE,
            saved_dataset_id=saved_dataset._ensure_saved(target_workspace),
            arguments={
                'generate_preview': 'True',
                'row_count': '1000'
            })

        dataset_client = _restclient(target_workspace).dataset
        action_result = dataset_client.get_action_result(
            target_workspace.subscription_id,
            target_workspace.resource_group,
            target_workspace.name,
            dataset_id=_LEGACY_DATASET_ID,
            request=staleness_request,
            custom_headers=_custom_headers)

        if action_result.is_up_to_date is not None:
            return not action_result.is_up_to_date

        raise AzureMLException(action_result.is_up_to_date_error)
Exemplo n.º 3
0
def upload_blob_from_stream(stream,
                            url,
                            content_type=None,
                            session=None,
                            timeout=None,
                            backoff=None,
                            retries=None):
    """Upload the contents of ``stream`` to the block blob addressed by ``url``.

    :param stream: Seekable stream to upload; it is rewound to its current position before uploading.
    :param url: Blob URL from which the SAS token, account, endpoint suffix, container and blob name are parsed.
    :param content_type: Content type stored in the blob's content settings.
    :param session: Request session handed to the blob service.
    :param timeout: Timeout forwarded to the blob upload call.
    :param backoff: Backoff value forwarded to ``execute_func_with_reset``.
    :param retries: Retry count forwarded to ``execute_func_with_reset``.
    :raises AzureMLException: If the upload fails with an AzureHttpError whose status code is 403.
    """
    # TODO add support for upload without azure.storage
    from azureml._vendor.azure_storage.blob import BlockBlobService
    from azureml._vendor.azure_storage.blob.models import ContentSettings
    credentials = get_block_blob_service_credentials(url)
    sas_token, account_name, endpoint_suffix, container_name, blob_name = credentials
    content_settings = ContentSettings(content_type=content_type)
    blob_service = BlockBlobService(account_name=account_name,
                                    sas_token=sas_token,
                                    request_session=session,
                                    endpoint_suffix=endpoint_suffix)

    # Remember the starting offset so the stream can be rewound here and before each retry.
    reset_func = StreamResetFunction(stream.tell())

    # Measure the local stream size (logged next to the uploaded blob size below), then rewind.
    stream.seek(0, os.SEEK_END)
    file_size = stream.tell()
    reset_func(stream)

    try:
        from azureml._restclient.clientbase import execute_func_with_reset
        execute_func_with_reset(backoff,
                                retries,
                                blob_service.create_blob_from_stream,
                                reset_func,
                                container_name=container_name,
                                blob_name=blob_name,
                                stream=stream,
                                content_settings=content_settings,
                                timeout=timeout,
                                validate_content=True)
    except AzureHttpError as e:
        # Anything other than an authorization failure is propagated untouched.
        if e.status_code != 403:
            raise
        # error code not present in AzureHttpError, so only the status code is reported
        azureml_error = AzureMLError.create(AuthorizationStorageAccount,
                                            account_name=account_name,
                                            container_name=container_name,
                                            status_code=e.status_code)
        raise AzureMLException._with_error(azureml_error, inner_exception=e)

    blob_properties = blob_service.get_blob_properties(container_name, blob_name)
    blob_size = blob_properties.properties.content_length
    module_logger.debug("Uploaded blob {} with size {}, file size {}.".format(
        blob_name, blob_size, file_size))
Exemplo n.º 4
0
def _dataprep_error_handler(e, message, is_dprep_exception):
    """Re-raise a failure as UserErrorException or AzureMLException. This function always raises.

    :param e: The original exception; attached to the raised exception as ``inner_exception``.
    :param message: Message for the raised exception. For dataprep exceptions the error code is appended.
    :type message: str
    :param is_dprep_exception: Whether ``e`` should be treated as a dataprep exception, in which case
        its ``error_code`` decides between a user error and a generic error.
    :type is_dprep_exception: bool
    :raises UserErrorException: If ``is_dprep_exception`` is true and the error code matches (via
        ``_contains``) one of the known user error codes.
    :raises AzureMLException: In every other case.
    """
    user_exception_list = ["Authentication", "NotFound", "Validation", "FieldNotFound",
                           "AlreadyExists", "FieldConflict", "StepTranslation", "DataError",
                           "NoColumnsError", "Assertion", "AuthenticationContextMismatch",
                           "CreateTable", "WriteTable"]

    if is_dprep_exception:
        # Read the code once with a fallback so an exception lacking ``error_code`` is still
        # reported through the AzureML exception types instead of failing with AttributeError.
        error_code = getattr(e, 'error_code', 'Unexpected')
        message = message + " ErrorCode: {}".format(error_code)
        if any(_contains(item, error_code) for item in user_exception_list):
            raise UserErrorException(message, inner_exception=e)

    raise AzureMLException(message, inner_exception=e)
Exemplo n.º 5
0
    def flush(self, source, timeout_seconds=None):
        """Drain the task queue and wait until every drained task is done or the timeout expires.

        Results of finished tasks are appended to ``self._results``.

        :param source: Label of the caller; only used to name the log context.
        :param timeout_seconds: Maximum number of seconds to wait. When None,
            ``self._flush_timeout_seconds`` is used.
        :raises AzureMLException: If some tasks are still not done when the wait loop ends.
        """
        with self._log_context("WaitFlushSource:{}".format(source)) as log_context:

            if timeout_seconds is None:
                log_context.debug("Overriding default flush timeout from None to {}".
                                  format(self._flush_timeout_seconds))
                timeout_seconds = self._flush_timeout_seconds
            else:
                log_context.debug("flush timeout {} is different from task queue timeout {}, using flush timeout".
                                  format(timeout_seconds, self._flush_timeout_seconds))

            start_time = time.time()

            #  Take tasks off of the queue
            tasks_to_wait = []
            while True:
                try:
                    tasks_to_wait.append(self._tasks.get_nowait())
                except Empty:
                    break

            progress_lines = []
            timeout_time = start_time + timeout_seconds

            log_context.debug("Waiting {} seconds on tasks: {}.".format(timeout_seconds, tasks_to_wait))

            not_done = True

            while not_done and time.time() <= timeout_time:
                # Query done() exactly once per task. Evaluating it in two separate passes lets a
                # task that finishes in between fall into neither list, silently dropping its result.
                completed_tasks = []
                pending_tasks = []
                for task in tasks_to_wait:
                    if task.done():
                        completed_tasks.append(task)
                    else:
                        pending_tasks.append(task)
                tasks_to_wait = pending_tasks
                not_done = len(tasks_to_wait) != 0

                self._results.extend(task.wait(awaiter_name=self.identity) for task in completed_tasks)

                if not_done:
                    progress_lines.extend(
                        "Waiting on task: {}.\n".format(task.ident) for task in tasks_to_wait)
                    progress_lines.append("{} tasks left. Current duration of flush {} seconds.\n".format(
                        len(tasks_to_wait), time.time() - start_time))

                    time.sleep(.25)

            self._logger.debug("".join(progress_lines))

            # Reach this case on timeout
            if not_done:
                azureml_error = AzureMLError.create(
                    FlushTaskTimeout, timeout_seconds=timeout_seconds
                )
                raise AzureMLException._with_error(azureml_error)
Exemplo n.º 6
0
    def __init__(self, saved_dataset_id, run_id, experiment_name, workspace, profile):
        """Initialize a DatasetProfile from the pieces describing a finished profile run.

        Every argument is required; the first one found to be None (checked in signature order)
        is logged and reported through an AzureMLException.

        :param saved_dataset_id: The id of the dataset on which profile is computed.
        :type saved_dataset_id: str
        :param run_id: The run id for the experiment which is used to compute the profile.
        :type run_id: str
        :param experiment_name: The name of the submitted experiment used to compute the profile.
        :type experiment_name: str
        :param workspace: Workspace which the profile run belongs to.
            See https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.workspace.workspace
            for more information on workspaces.
        :type workspace: azureml.core.workspace.Workspace
        :param profile: Profile result from the latest profile run of type DataProfile.
        :type profile: azureml.dataprep.DataProfile
        :raises AzureMLException: If any argument is None.
        """
        # Ordered (value, log message) pairs; order matters because only the first miss is reported.
        required_arguments = (
            (saved_dataset_id, 'saved_dataset_id is none'),
            (run_id, 'run_id is none'),
            (experiment_name, 'experiment_name is none'),
            (workspace, 'workspace is none'),
            (profile, 'profile is none'),
        )
        for argument_value, log_message in required_arguments:
            if argument_value is None:
                _get_logger().error(log_message)
                raise AzureMLException("Unable to fetch profile results. Please submit a new profile run.")

        self._saved_dataset_id = saved_dataset_id
        self._run_id = run_id
        self._experiment_name = experiment_name
        self._workspace = workspace
        self._profile = profile
Exemplo n.º 7
0
 def exec_func():
     """Download the blob to ``path`` and check the local file size against the blob's content length.

     Raises AzureMLException (BadDataDownloaded) when the two sizes differ.
     """
     download_result = blob_service.get_blob_to_path(
         container_name=container_name,
         blob_name=blob_name,
         file_path=path,
         max_connections=max_concurrency,
         validate_content=_validate_check_sum)

     local_size = os.stat(path).st_size
     module_logger.debug("Downloaded file {} with size {}.".format(path, local_size))

     expected_size = download_result.properties.content_length
     sizes_differ = expected_size != local_size
     if not sizes_differ:
         return

     azureml_error = AzureMLError.create(BadDataDownloaded,
                                         file_size=local_size,
                                         content_length=expected_size)
     raise AzureMLException._with_error(azureml_error)
Exemplo n.º 8
0
def _retry(exec_func,
           clean_up_func=(lambda: None),
           max_retries=5,
           exceptions=(Exception)):
    """
    A helper function for retry

    :param exec_func: the execution function that runs inside retry mechnism
    :type exec_func: func
    :param clean_up_func: a clean up function that runs inside final statement
    :type clean_up_func: func
    :param max_retries: the number of retries
    :type max_retries: int
    :param exceptions: the exceptions to handle in execution function
    :type stream: Tuple[Type[Exception]]
    :return: results from the return of execution func
    :rtype: AnyType
    """
    wait_time = 2
    retries = 0
    while retries < max_retries:
        try:
            return exec_func()
        except exceptions as request_exception:
            retries += 1
            module_logger.debug(
                'retry has happened in the {} times'.format(retries))
            if retries < max_retries:
                module_logger.debug(
                    'RequestException or HTTPError raised in download_file with message: {}'
                    .format(request_exception))
                time.sleep(wait_time)
                wait_time = wait_time**2
                continue
            else:
                module_logger.error(
                    'Failed to download file with error: {}'.format(
                        request_exception))
                azureml_error = AzureMLError.create(DownloadFailed,
                                                    error=request_exception)
                raise AzureMLException._with_error(azureml_error)
        finally:
            clean_up_func()
Exemplo n.º 9
0
    def delete_experiment(self, experiment_id, timeout_seconds=600):
        """
        Delete an empty experiment by experiment_id and wait for the operation to finish.

        :param experiment_id: id of the experiment to delete
        :param timeout_seconds: timeout handed to the poller when waiting for the result
        :return: when the delete operation is complete
        :raises AzureMLException: when the final polled response still has status code 202
        """
        # initial response could be 200 or 202
        first_response = self._execute_with_workspace_arguments(
            self._client.experiment.delete,
            experiment_id=experiment_id,
            raw=True)

        from .polling import AzureMLPolling
        from msrest.polling.poller import LROPoller

        # "AzureML polling" is a name for the 202/200/location-header contract.
        # Despite its name, ``timeout`` here is the delay between polls.
        poll_delay = 5
        polling_method = AzureMLPolling(
            timeout=poll_delay,
            lro_options={'final-state-via': 'location'})

        # Maps the final response to 1 when it is still a 202, so the caller can be told
        # that the timeout expired before the deletion completed; 0 otherwise.
        def deserialization_callback(response):
            if response is None:
                return 0
            if response.status_code == 202:
                return 1
            return 0

        poller = LROPoller(self._client.experiment._client,
                           first_response,
                           deserialization_callback,
                           polling_method)

        # this call blocks until the async operation returns 200
        outcome = poller.result(timeout_seconds)
        if outcome != 1:
            return

        azureml_error = AzureMLError.create(FailedIdWithinSeconds,
                                            experiment_id=experiment_id,
                                            timeout_seconds=timeout_seconds)
        raise AzureMLException._with_error(azureml_error)
Exemplo n.º 10
0
    def get_profile_runs(self, workspace=None):
        """List the earlier profile runs recorded for this or the same dataset in the workspace.

        Profile actions are fetched page by page; actions without a run id, and actions whose run
        cannot be retrieved, are left out of the result.

        :param workspace: The workspace where profile run was submitted. Defaults to the workspace of this dataset.
            Required if dataset is not associated to a workspace.
            See https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.workspace.workspace
            for more information on workspaces.
        :type workspace: azureml.core.Workspace
        :return: iterator object of type azureml.core.Run.
        :rtype: iter(azureml.core.Run)
        :raises AzureMLException: If the service returns no profile actions at all.
        """
        workspace = self._ensure_workspace(workspace)
        from azureml._restclient.models import ActionRequestDto
        profile_request = ActionRequestDto(
            action_type=_ACTION_TYPE_PROFILE,
            saved_dataset_id=self._ensure_saved(workspace),
            arguments={'generate_preview': 'True', 'row_count': '1000'})

        # Always request the first page, then keep going while the service hands back a token.
        action_dtos = []
        next_token = None
        while True:
            page = _restclient(workspace).dataset.list_actions_from_request(
                workspace.subscription_id,
                workspace.resource_group,
                workspace.name,
                dataset_id=_LEGACY_DATASET_ID,
                request=profile_request,
                count=1000,
                custom_headers=_custom_headers,
                continuation_token=next_token)

            action_dtos.extend(page.value)
            next_token = page.continuation_token
            if next_token is None:
                break

        if not action_dtos:
            raise AzureMLException(
                'Unable to find any run information. Please submit a new profile run.'
            )

        runs = []
        for action_dto in action_dtos:
            # This is done to ensure backward compatibility. Earlier we do not persist
            # run_id for local runs. Hence for older runs run_id is empty.
            if action_dto.run_id is None:
                continue
            from azureml.core import Experiment, get_run
            experiment = Experiment(workspace, action_dto.experiment_name)
            try:
                run = get_run(experiment, action_dto.run_id)
            except Exception:
                # Runs that cannot be retrieved are skipped rather than failing the listing.
                continue
            runs.append(run)

        return iter(runs)
Exemplo n.º 11
0
    def mount(self, mount_point=None, **kwargs):
        """Create a context manager for mounting file streams defined by the dataset as local files.

        .. remarks::

            A context manager will be returned to manage the lifecycle of the mount. To mount, you will need to
            enter the context manager and to unmount, exit from the context manager.

            Mount is only supported on Unix or Unix-like operating systems with the native package libfuse installed.
            If you are running inside a docker container, the docker container must be started with the `--privileged`
            flag or started with `--cap-add SYS_ADMIN --device /dev/fuse`.

           .. code-block:: python

                datastore = Datastore.get(workspace, 'workspaceblobstore')
                dataset = Dataset.File.from_files((datastore, 'animals/dog/year-*/*.jpg'))

                with dataset.mount() as mount_context:
                    # list top level mounted files and folders in the dataset
                    os.listdir(mount_context.mount_point)

                # You can also use the start and stop methods
                mount_context = dataset.mount()
                mount_context.start()  # this will mount the file streams
                mount_context.stop()  # this will unmount the file streams

           If target_path starts with a /, then it will be treated as an absolute path. If it doesn't start
           with a /, then it will be treated as a relative path relative to the current working directory.

        :param mount_point: The local directory to mount the files to. If None, the data will be mounted into a
            temporary directory, which you can find by calling the `MountContext.mount_point` instance method.
        :type mount_point: str
        :param kwargs: Optional keyword arguments. ``mount_options`` (default None) is forwarded to the
            mount call as its ``options`` argument. ``skip_validate`` (default False), when true, skips
            the check that the dataset source is valid before mounting.
        :return: Returns a context manager for managing the lifecycle of the mount.
        :rtype: azureml.dataprep.fuse.daemon.MountContext
        :raises UserErrorException: If fuse mounting is unavailable (OSError while loading it), if
            ``mount_point`` is already a mount point, or if the dataset source is found to be invalid.
        :raises AzureMLException: If loading the mount function or validating the source fails unexpectedly.
        """
        try:
            mount = dataprep_fuse().mount
        except ValueError as e:
            if 'Invalid mount arguments' in str(e):
                raise UserErrorException(e)
            raise AzureMLException(
                "Execution failed unexpectedly due to: {}".format(str(e)))
        except OSError as e:
            # The adjacent literals are concatenated; each piece ends with a space so that the
            # sentences do not run together ("Execution failed unexpectedly ...").
            raise UserErrorException(
                'Mount is only supported on Unix or Unix-like operating systems with the '
                'native package libfuse installed. For more information, please refer to the '
                'remarks section of FileDataset.mount\'s documentation. Execution failed '
                'unexpectedly due to {}'.format(e.__class__.__name__))
        except Exception as e:
            raise AzureMLException(
                "Mount failed unexpectedly due to: {}".format(str(e)))

        mount_point = _ensure_path(mount_point)
        if os.path.ismount(mount_point):
            raise UserErrorException(
                '"{0}" is already mounted. Run `sudo umount "{0}"` to unmount it.'
                .format(mount_point))

        if not os.path.exists(mount_point):
            os.makedirs(mount_point)

        # Both dataflows share one invocation id; the first is only used to find the common path prefix.
        invocation_id = str(uuid.uuid4())
        dataflow = get_dataflow_for_execution(self._dataflow,
                                              'mount.find_prefix',
                                              'FileDataset',
                                              invocation_id=invocation_id)
        base_path = _find_path_prefix(dataflow)
        dataflow = get_dataflow_for_execution(self._dataflow,
                                              'mount',
                                              'FileDataset',
                                              invocation_id=invocation_id)
        mount_options = kwargs.get('mount_options', None)
        skip_validate = kwargs.get('skip_validate', False)

        if not skip_validate:
            try:
                is_invalid = dataflow.has_invalid_source(
                    return_validation_error=True)
                if is_invalid is not False:  # This means that the source is invalid
                    raise UserErrorException(
                        "Cannot mount Dataset(id='{}', name='{}', version={}). "
                        "Source of the dataset is either not "
                        "accessible or does not contain any data. "
                        "Error Message: {}".format(self.id, self.name,
                                                   self.version, is_invalid))
            except TypeError:
                # This catch is for backwards compatibility. There are valid version combinations of dataprep
                # and core where dataflow.has_invalid_source will not have the return_validation_error parameter,
                # which the above call will throw a TypeError.
                if dataflow.has_invalid_source(
                ):  # This means that the source is invalid
                    raise UserErrorException(
                        "Cannot mount dataset. Source of the dataset is either not "
                        "accessible or does not contain any data. ")
            except AttributeError:
                # This catch is for backwards compatibility. There are valid version combinations of dataprep
                # and core where Dataflow will not have the has_invalid_source method.
                pass
            except UserErrorException:
                # Raised above for an invalid source; must bypass the generic handler below.
                raise
            except AzureMLException:
                raise
            except Exception as e:
                dataset_info = None if self.id is None else {
                    'id': self.id,
                    'name': self.name,
                    'version': self.version
                }
                message, is_dprep_exception = _construct_message_and_check_exception_type(
                    e, dataset_info, "mount")
                _logger.error(message)
                _dataprep_error_handler(e, message, is_dprep_exception)

        return mount(dataflow=dataflow,
                     files_column='Path',
                     mount_point=mount_point,
                     base_path=base_path,
                     options=mount_options,
                     foreground=False,
                     invocation_id=invocation_id)
Exemplo n.º 12
0
    def get_child_runs(self,
                       root_run_id,
                       recursive=False,
                       _filter_on_server=False,
                       page_size=DEFAULT_PAGE_SIZE,
                       order_by=None,
                       caller=None,
                       custom_headers=None,
                       **kwargs):
        """
        Get child runs by current run_id
        :param root_run_id: optimization id for hierarchy(required)
        :type root_run_id: str
        :param recursive: fetch grandchildren and further descendants(required)
        :type recursive: bool
        :param _filter_on_server: when True, the run dtos are returned as received from the service,
            without the client-side filtering step. In the recursive case it is forced to True when
            root_run_id equals this client's run id, and kwargs are then converted into a filter
            expression that is sent with the query.
        :type _filter_on_server: bool
        :param page_size: number of dto returned by one request (optional)
        :type page_size: int
        :param order_by: keys to sort return values, ('sort_key', 'asc'/'desc')(optional)
        :type order_by: tuple (str, str)
        :param caller: caller function name (optional)
        :type caller: str
        :param custom_headers: headers that will be added to the request (optional)
        :type custom_headers: dict
        :param kwargs: extra keyword arguments; they are merged into the keyword arguments of the
            service call and are also used as the filter criteria (server side or client side)
        :return: list of dictionary whose keys are property of ~_restclient.models.RunDto
        :raises AzureMLException: when recursive and _filter_on_server are both set but root_run_id
            differs from this client's run id
        """
        # Fall back to ordering by start time when the caller gives no explicit order.
        order_by_expression = _validate_order_by(order_by) if order_by else [
            ORDER_BY_STARTTIME_EXPRESSION
        ]
        client_kwargs = _generate_client_kwargs(top=page_size,
                                                orderby=order_by_expression,
                                                caller=caller,
                                                custom_headers=custom_headers,
                                                is_paginated=True)
        client_kwargs.update(kwargs)
        # TODO: _restclient shouldn't depend on core
        if recursive and _filter_on_server and root_run_id != self._run_id:
            azureml_error = AzureMLError.create(
                OnlySupportedServiceSideFiltering)
            raise AzureMLException._with_error(azureml_error)
        elif recursive:
            # Server-side filtering is always used when this run is itself the root run.
            _filter_on_server = root_run_id == self._run_id or _filter_on_server
            filter_expression = self._get_run_filter_expr(
                **kwargs) if _filter_on_server else None
            root_filter = 'RootRunId eq {0}'.format(root_run_id)
            exclude_parent_filter = 'RunId ne {0}'.format(self._run_id)
            # Always restrict to the root run; add the kwargs-derived expression when there is one.
            full_filter = and_join([root_filter, filter_expression
                                    ]) if filter_expression else root_filter
            # With server-side filtering, this run itself is excluded from the result by the query.
            full_filter = and_join([full_filter, exclude_parent_filter
                                    ]) if _filter_on_server else full_filter

            query_params = QueryParamsDto(filter=full_filter)
            run_dtos = self._execute_with_experiment_arguments(
                self._client.run.get_by_query,
                query_params=query_params,
                **client_kwargs)

            if _filter_on_server:
                return run_dtos

            # Filter out nodes outside of the desired sub tree
            run_hierarchy = Tree(run_dtos)
            sub_tree_run_dtos = run_hierarchy.get_subtree_dtos(self._run_id)
            return self._client_filter(sub_tree_run_dtos, **kwargs)

        else:
            # Non-recursive lookup: root_run_id is not used on this path.
            run_dtos = self._execute_with_run_arguments(
                self._client.run.get_child, **client_kwargs)
            return run_dtos if _filter_on_server else self._client_filter(
                run_dtos, **kwargs)