# Example #1
class Analysis(Identifiable, Serialisable, Labelable, Taggable):
    """A class representing a scientific or computational analysis. It holds references to all configuration, input, and
    output data, logs, connections to child services, credentials, etc. It's essentially the "Internal API" for your
    service - a single point of contact where you can get or update anything you need.

    An ``Analysis`` instance is automatically provided to the app in an Octue service when a question is received. Its
    attributes include every strand that can be added to a ``Twine``, although only the strands specified in the
    service's twine will be non-``None``. Incoming data is validated before it's added to the analysis.

    All input and configuration attributes are hashed using a `BLAKE3 hash <https://github.com/BLAKE3-team/BLAKE3>`_ so
    the inputs and configuration that produced a given output in your app can always be verified. These hashes exist on
    the following attributes:

    -   ``input_values_hash``
    -   ``input_manifest_hash``
    -   ``configuration_values_hash``
    -   ``configuration_manifest_hash``

    If a strand is ``None``, so will its corresponding hash attribute be. The hash of a datafile is the hash of its
    file, while the hash of a manifest or dataset is the cumulative hash of the files it refers to.

    :param twined.Twine|dict|str twine: the twine, dictionary defining a twine, or path to "twine.json" file defining the service's data interface
    :param callable|None handle_monitor_message: an optional function for sending monitor messages to the parent that requested the analysis
    :param any configuration_values: the configuration values for the analysis - this can be expressed as a python primitive (e.g. dict), a path to a JSON file, or a JSON string.
    :param octue.resources.manifest.Manifest configuration_manifest: a manifest of configuration datasets for the analysis if required
    :param any input_values: the input values for the analysis - this can be expressed as a python primitive (e.g. dict), a path to a JSON file, or a JSON string.
    :param octue.resources.manifest.Manifest input_manifest: a manifest of input datasets for the analysis if required
    :param any output_values: any output values the analysis produces
    :param octue.resources.manifest.Manifest output_manifest: a manifest of output dataset from the analysis if it produces any
    :param dict children: a mapping of string key to ``Child`` instance for all the children used by the service
    :param str id: Optional UUID for the analysis
    :return None:
    """

    def __init__(self, twine, handle_monitor_message=None, **kwargs):
        # Accept either a ready-made `Twine` instance or anything the `Twine` constructor can parse (a dict, a JSON
        # string, or a path to a "twine.json" file).
        if isinstance(twine, Twine):
            self.twine = twine
        else:
            self.twine = Twine(source=twine)

        self._handle_monitor_message = handle_monitor_message

        # Pop every possible strand out of the keyword arguments (defaulting to `None` for absent strands) so that
        # only non-strand kwargs are forwarded to `super().__init__` at the end.
        strand_kwargs = {name: kwargs.pop(name, None) for name in ALL_STRANDS}

        # Values strands.
        self.configuration_values = strand_kwargs.get("configuration_values")
        self.input_values = strand_kwargs.get("input_values")
        self.output_values = strand_kwargs.get("output_values")

        # Manifest strands.
        self.configuration_manifest = strand_kwargs.get("configuration_manifest")
        self.input_manifest = strand_kwargs.get("input_manifest")
        self.output_manifest = strand_kwargs.get("output_manifest")

        # Other strands.
        self.children = strand_kwargs.get("children")

        # Non-strands.
        self.output_location = kwargs.pop("output_location", None)

        self._calculate_strand_hashes(strands=strand_kwargs)
        self._finalised = False
        super().__init__(**kwargs)

    @property
    def finalised(self):
        """Check whether the analysis has been finalised (i.e. whether its outputs have been validated and, if an output
        manifest is produced, its datasets uploaded).

        :return bool:
        """
        return self._finalised

    def send_monitor_message(self, data):
        """Send a monitor message to the parent that requested the analysis.

        :param any data: any JSON-compatible data structure
        :raise InvalidMonitorMessage: if the data doesn't conform to the monitor message schema in the twine
        :return None:
        """
        try:
            self.twine.validate_monitor_message(source=data)
        except twined.exceptions.InvalidValuesContents as e:
            # Chain the original exception so the underlying validation failure isn't lost from the traceback.
            raise InvalidMonitorMessage(e) from e

        if self._handle_monitor_message is None:
            logger.warning("Attempted to send a monitor message but no handler is specified.")
            return

        self._handle_monitor_message(data)

    def finalise(self, upload_output_datasets_to=None):
        """Validate the output values and output manifest, optionally uploading the output manifest's datasets to the
        cloud and updating its dataset paths to signed URLs.

        :param str|None upload_output_datasets_to: if provided, upload any output datasets to this cloud directory and update the output manifest with their locations
        :return None:
        """
        serialised_strands = {"output_values": None, "output_manifest": None}

        if self.output_values:
            serialised_strands["output_values"] = json.dumps(self.output_values, cls=OctueJSONEncoder)

        if self.output_manifest:
            serialised_strands["output_manifest"] = self.output_manifest.to_primitive()

        self.twine.validate(**serialised_strands)
        self._finalised = True
        logger.info("Validated output values and output manifest against the twine.")

        # Skip uploading if no upload location was given or there's no output manifest. NOTE: this must test the
        # manifest's value, not `hasattr(self, "output_manifest")` - the attribute is always set in `__init__`, so the
        # `hasattr` check was always true and a `None` manifest caused an `AttributeError` below.
        if not (upload_output_datasets_to and self.output_manifest):
            return

        for name, dataset in self.output_manifest.datasets.items():
            dataset.upload(cloud_path=storage.path.join(upload_output_datasets_to, name))

        self.output_manifest.use_signed_urls_for_datasets()

        logger.info("Uploaded output datasets to %r.", upload_output_datasets_to)

    def _calculate_strand_hashes(self, strands):
        """Calculate the hashes of the strands specified in the HASH_FUNCTIONS constant.

        :param dict strands: strand names mapped to strand data
        :return None:
        """
        for strand_name, strand_data in strands.items():
            if strand_name not in HASH_FUNCTIONS:
                continue

            strand_hash_name = f"{strand_name}_hash"

            # A `None` strand gets a `None` hash so the hash attributes always exist.
            if strand_data is None:
                setattr(self, strand_hash_name, None)
            else:
                setattr(self, strand_hash_name, HASH_FUNCTIONS[strand_name](strand_data))
# Example #2
class Runner:
    """A runner of analyses for a given service.

    The ``Runner`` class provides a set of configuration parameters for use by your application, together with a range
    of methods for managing input and output file parsing as well as controlling logging.

    :param callable|type|module|str app_src: either a function that accepts an Octue analysis, a class with a ``run`` method that accepts an Octue analysis, or a path to a directory containing an ``app.py`` file containing one of these
    :param str|twined.Twine twine: path to the twine file, a string containing valid twine json, or a Twine instance
    :param str|dict|None configuration_values: The strand data. Can be expressed as a string path of a *.json file (relative or absolute), as an open file-like object (containing json data), as a string of json data or as an already-parsed dict.
    :param str|dict|None configuration_manifest: The strand data. Can be expressed as a string path of a *.json file (relative or absolute), as an open file-like object (containing json data), as a string of json data or as an already-parsed dict.
    :param str|dict|None children: The children strand data. Can be expressed as a string path of a *.json file (relative or absolute), as an open file-like object (containing json data), as a string of json data or as an already-parsed dict.
    :param str|None output_location: the path to a cloud directory to save output datasets at
    :param str|None project_name: name of Google Cloud project to get credentials from
    :param str|None service_id: the ID of the service being run
    :return None:
    """

    def __init__(
        self,
        app_src,
        twine="twine.json",
        configuration_values=None,
        configuration_manifest=None,
        children=None,
        output_location=None,
        project_name=None,
        service_id=None,
    ):
        self.app_source = app_src
        self.children = children

        # The output location, if given, must be a Google Cloud Storage path.
        if output_location and not re.match(r"^gs://[a-z\d][a-z\d_./-]*$", output_location):
            raise exceptions.InvalidInputException(
                "The output location must be a Google Cloud Storage path e.g. 'gs://bucket-name/output_directory'."
            )

        self.output_location = output_location

        # Ensure the twine is present and instantiate it.
        if isinstance(twine, Twine):
            self.twine = twine
        else:
            self.twine = Twine(source=twine)

        logger.debug("Parsed twine with strands %r", self.twine.available_strands)

        # Validate and initialise configuration data.
        self.configuration = self.twine.validate(
            configuration_values=configuration_values,
            configuration_manifest=configuration_manifest,
            cls=CLASS_MAP,
        )
        logger.debug("Configuration validated.")

        self.service_id = service_id
        self._project_name = project_name

    def run(
        self,
        analysis_id=None,
        input_values=None,
        input_manifest=None,
        analysis_log_level=logging.INFO,
        analysis_log_handler=None,
        handle_monitor_message=None,
    ):
        """Run an analysis.

        :param str|None analysis_id: UUID of analysis
        :param str|dict|None input_values: the input_values strand data. Can be expressed as a string path of a *.json file (relative or absolute), as an open file-like object (containing json data), as a string of json data or as an already-parsed dict.
        :param str|dict|octue.resources.manifest.Manifest|None input_manifest: The input_manifest strand data. Can be expressed as a string path of a *.json file (relative or absolute), as an open file-like object (containing json data), as a string of json data or as an already-parsed dict.
        :param str analysis_log_level: the level below which to ignore log messages
        :param logging.Handler|None analysis_log_handler: the logging.Handler instance which will be used to handle logs for this analysis run. Handlers can be created as per the logging cookbook https://docs.python.org/3/howto/logging-cookbook.html but should use the format defined above in LOG_FORMAT.
        :param callable|None handle_monitor_message: a function that sends monitor messages to the parent that requested the analysis
        :return octue.resources.analysis.Analysis:
        """
        # Pull any required secrets into the environment before input validation so the credentials strand can be
        # satisfied.
        if hasattr(self.twine, "credentials"):
            self._populate_environment_with_google_cloud_secrets()
            credentials = self.twine.credentials
        else:
            credentials = None

        inputs = self.twine.validate(
            input_values=input_values,
            input_manifest=input_manifest,
            credentials=credentials,
            children=self.children,
            cls=CLASS_MAP,
            allow_missing=False,
            allow_extra=False,
        )
        logger.debug("Inputs validated.")

        # Check the files in each non-output manifest against the twine's file tags templates.
        for manifest_strand in self.twine.available_manifest_strands:
            if manifest_strand == "output_manifest":
                continue

            self._validate_dataset_file_tags(manifest_kind=manifest_strand, manifest=inputs[manifest_strand])

        # Convert the raw children strand data into `Child` instances keyed by the child's key.
        if inputs["children"] is not None:
            inputs["children"] = {
                child["key"]: Child(
                    id=child["id"],
                    backend=child["backend"],
                    internal_service_name=self.service_id,
                )
                for child in inputs["children"]
            }

        outputs_and_monitors = self.twine.prepare("monitor_message", "output_values", "output_manifest", cls=CLASS_MAP)

        analysis_id = str(analysis_id) if analysis_id else gen_uuid()

        if analysis_log_handler:
            extra_log_handlers = [analysis_log_handler]
        else:
            extra_log_handlers = None

        # Temporarily replace the root logger's handlers with a `StreamHandler` and the analysis log handler that
        # include the analysis ID in the logging metadata.
        with AnalysisLogHandlerSwitcher(
            analysis_id=analysis_id,
            logger=logging.getLogger(),
            analysis_log_level=analysis_log_level,
            extra_log_handlers=extra_log_handlers,
        ):

            analysis = Analysis(
                id=analysis_id,
                twine=self.twine,
                handle_monitor_message=handle_monitor_message,
                output_location=self.output_location,
                **self.configuration,
                **inputs,
                **outputs_and_monitors,
            )

            try:
                # App as a class that takes "analysis" as a constructor argument and contains a method named "run" that
                # takes no arguments.
                if isinstance(self.app_source, type):
                    self.app_source(analysis).run()

                # App as a module containing a function named "run" that takes "analysis" as an argument.
                elif hasattr(self.app_source, "run"):
                    self.app_source.run(analysis)

                # App as a string path to a module containing a class named "App" or a function named "run". The same other
                # specifications apply as described above.
                elif isinstance(self.app_source, str):

                    with AppFrom(self.app_source) as app:
                        if hasattr(app.app_module, "App"):
                            app.app_module.App(analysis).run()
                        else:
                            app.run(analysis)

                # App as a function that takes "analysis" as an argument.
                else:
                    self.app_source(analysis)

            except ModuleNotFoundError as e:
                # Only a string app source has a filesystem path to report. For class/module/function app sources,
                # re-raise unchanged - `os.path.abspath` would raise a confusing `TypeError` on them. Chain the
                # original exception so the failing import's traceback is preserved.
                if isinstance(self.app_source, str):
                    raise ModuleNotFoundError(f"{e.msg} in {os.path.abspath(self.app_source)!r}.") from e
                raise

            except Exception as e:
                logger.error(str(e))
                # Bare `raise` re-raises the active exception with its original traceback.
                raise

            # Ensure the analysis' outputs are validated (and uploaded, if configured) even if the app didn't finalise.
            if not analysis.finalised:
                analysis.finalise()

            return analysis

    def _populate_environment_with_google_cloud_secrets(self):
        """Get any secrets specified in the credentials strand from Google Cloud Secret Manager and put them in the
        local environment, ready for use by the runner.

        :return None:
        """
        missing_credentials = tuple(
            credential for credential in self.twine.credentials if credential["name"] not in os.environ
        )

        # Nothing to fetch - every required credential is already in the environment.
        if not missing_credentials:
            return

        google_cloud_credentials, project_name = auth.default()
        secrets_client = secretmanager.SecretManagerServiceClient(credentials=google_cloud_credentials)

        # Fall back to the project name given at construction time if default credentials provide none.
        if google_cloud_credentials is None:
            project_name = self._project_name

        for credential in missing_credentials:
            secret_path = secrets_client.secret_version_path(
                project=project_name,
                secret=credential["name"],
                secret_version="latest",
            )

            try:
                secret = secrets_client.access_secret_version(name=secret_path).payload.data.decode("UTF-8")
            except google.api_core.exceptions.NotFound:
                # No need to raise an error here as the Twine validation that follows will do so.
                continue

            os.environ[credential["name"]] = secret

    def _validate_dataset_file_tags(self, manifest_kind, manifest):
        """Validate the tags of the files of each dataset in the manifest against the file tags template in the
        corresponding dataset field in the given manifest field of the twine.

        :param str manifest_kind: the kind of manifest that's being validated (so the correct schema can be accessed)
        :param octue.resources.manifest.Manifest manifest: the manifest whose datasets' files are to be validated
        :return None:
        """
        # This is the manifest schema included in the twine.json file, not the schema for `manifest.json` files.
        manifest_schema = getattr(self.twine, manifest_kind)

        for dataset_name, dataset_schema in manifest_schema["datasets"].items():
            dataset = manifest.datasets.get(dataset_name)
            file_tags_template = dataset_schema.get("file_tags_template")

            # Allow optional datasets in future (not currently allowed by `twined`).
            if not (dataset and file_tags_template):
                continue

            for file in dataset.files:
                try:
                    jsonschema_validate(instance=dict(file.tags), schema=file_tags_template)
                except ValidationError as e:
                    message = (
                        e.message
                        + f" for files in the {dataset_name!r} dataset. The affected datafile is "
                        f"{file.path!r}. Add the property to the datafile as a tag to fix this."
                    )

                    raise twined.exceptions.invalid_contents_map[manifest_kind](message)