class Analysis(Identifiable, Serialisable, Labelable, Taggable):
    """A class representing a scientific or computational analysis. It holds references to all configuration, input,
    and output data, logs, connections to child services, credentials, etc. It's essentially the "Internal API" for
    your service - a single point of contact where you can get or update anything you need.

    An ``Analysis`` instance is automatically provided to the app in an Octue service when a question is received. Its
    attributes include every strand that can be added to a ``Twine``, although only the strands specified in the
    service's twine will be non-``None``. Incoming data is validated before it's added to the analysis.

    All input and configuration attributes are hashed using a `BLAKE3 hash <https://github.com/BLAKE3-team/BLAKE3>`_
    so the inputs and configuration that produced a given output in your app can always be verified. These hashes
    exist on the following attributes:

    - ``input_values_hash``
    - ``input_manifest_hash``
    - ``configuration_values_hash``
    - ``configuration_manifest_hash``

    If a strand is ``None``, its corresponding hash attribute is also ``None``. The hash of a datafile is the hash of
    its file, while the hash of a manifest or dataset is the cumulative hash of the files it refers to.

    :param twined.Twine|dict|str twine: the twine, a dictionary defining a twine, or a path to a "twine.json" file defining the service's data interface
    :param callable|None handle_monitor_message: an optional function for sending monitor messages to the parent that requested the analysis
    :param any configuration_values: the configuration values for the analysis - this can be expressed as a python primitive (e.g. dict), a path to a JSON file, or a JSON string
    :param octue.resources.manifest.Manifest configuration_manifest: a manifest of configuration datasets for the analysis if required
    :param any input_values: the input values for the analysis - this can be expressed as a python primitive (e.g. dict), a path to a JSON file, or a JSON string
    :param octue.resources.manifest.Manifest input_manifest: a manifest of input datasets for the analysis if required
    :param any output_values: any output values the analysis produces
    :param octue.resources.manifest.Manifest output_manifest: a manifest of output datasets from the analysis if it produces any
    :param dict children: a mapping of string key to ``Child`` instance for all the children used by the service
    :param str id: optional UUID for the analysis
    :return None:
    """

    def __init__(self, twine, handle_monitor_message=None, **kwargs):
        if isinstance(twine, Twine):
            self.twine = twine
        else:
            self.twine = Twine(source=twine)

        self._handle_monitor_message = handle_monitor_message

        strand_kwargs = {name: kwargs.pop(name, None) for name in ALL_STRANDS}

        # Values strands.
        self.configuration_values = strand_kwargs.get("configuration_values", None)
        self.input_values = strand_kwargs.get("input_values", None)
        self.output_values = strand_kwargs.get("output_values", None)

        # Manifest strands.
        self.configuration_manifest = strand_kwargs.get("configuration_manifest", None)
        self.input_manifest = strand_kwargs.get("input_manifest", None)
        self.output_manifest = strand_kwargs.get("output_manifest", None)

        # Other strands.
        self.children = strand_kwargs.get("children", None)

        # Non-strands.
        self.output_location = kwargs.pop("output_location", None)

        self._calculate_strand_hashes(strands=strand_kwargs)
        self._finalised = False
        super().__init__(**kwargs)

    @property
    def finalised(self):
        """Check whether the analysis has been finalised (i.e. whether its outputs have been validated and, if an
        output manifest is produced, its datasets uploaded).

        :return bool:
        """
        return self._finalised

    def send_monitor_message(self, data):
        """Send a monitor message to the parent that requested the analysis.

        :param any data: any JSON-compatible data structure
        :return None:
        """
        try:
            self.twine.validate_monitor_message(source=data)
        except twined.exceptions.InvalidValuesContents as e:
            raise InvalidMonitorMessage(e)

        if self._handle_monitor_message is None:
            logger.warning("Attempted to send a monitor message but no handler is specified.")
            return

        self._handle_monitor_message(data)

    def finalise(self, upload_output_datasets_to=None):
        """Validate the output values and output manifest, optionally uploading the output manifest's datasets to the
        cloud and updating its dataset paths to signed URLs.

        :param str|None upload_output_datasets_to: if provided, upload any output datasets to this cloud directory and update the output manifest with their locations
        :return None:
        """
        serialised_strands = {"output_values": None, "output_manifest": None}

        if self.output_values:
            serialised_strands["output_values"] = json.dumps(self.output_values, cls=OctueJSONEncoder)

        if self.output_manifest:
            serialised_strands["output_manifest"] = self.output_manifest.to_primitive()

        self.twine.validate(**serialised_strands)
        self._finalised = True
        logger.info("Validated output values and output manifest against the twine.")

        if not (upload_output_datasets_to and self.output_manifest):
            return

        for name, dataset in self.output_manifest.datasets.items():
            dataset.upload(cloud_path=storage.path.join(upload_output_datasets_to, name))

        self.output_manifest.use_signed_urls_for_datasets()
        logger.info("Uploaded output datasets to %r.", upload_output_datasets_to)

    def _calculate_strand_hashes(self, strands):
        """Calculate the hashes of the strands specified in the HASH_FUNCTIONS constant.

        :param dict strands: strand names mapped to strand data
        :return None:
        """
        for strand_name, strand_data in strands.items():
            if strand_name in HASH_FUNCTIONS:
                strand_hash_name = f"{strand_name}_hash"

                if strand_data is not None:
                    setattr(self, strand_hash_name, HASH_FUNCTIONS[strand_name](strand_data))
                else:
                    setattr(self, strand_hash_name, None)
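
# A minimal usage sketch (illustrative, not part of the library): how an app function
# might interact with the `Analysis` instance it's given. The input/output value names
# used here ("height", "width", "area") are hypothetical - they assume a twine whose
# input and output values schemas define them and that defines a monitor message schema.
def example_app(analysis):
    """Compute a hypothetical output value from the analysis's input values.

    :param octue.resources.analysis.Analysis analysis:
    :return None:
    """
    # Report progress to the parent that asked the question. If no handler was
    # provided, a warning is logged and the message is dropped.
    analysis.send_monitor_message({"status": "started"})

    # The input values have already been validated against the twine at this point.
    analysis.output_values = {"area": analysis.input_values["height"] * analysis.input_values["width"]}

    # Validate the outputs against the twine. If the app doesn't call this itself,
    # the runner calls it automatically once the app returns.
    analysis.finalise()
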
class Runner:
    """A runner of analyses for a given service.

    The ``Runner`` class provides a set of configuration parameters for use by your application, together with a range
    of methods for managing input and output file parsing as well as controlling logging.

    :param callable|type|module|str app_src: either a function that accepts an Octue analysis, a class with a ``run`` method that accepts an Octue analysis, a module containing a ``run`` function, or a path to a directory containing an ``app.py`` file defining one of these
    :param str|twined.Twine twine: path to the twine file, a string containing valid twine JSON, or a ``Twine`` instance
    :param str|dict|None configuration_values: the configuration values strand data - this can be expressed as a string path to a ``*.json`` file (relative or absolute), an open file-like object containing JSON data, a string of JSON data, or an already-parsed dict
    :param str|dict|None configuration_manifest: the configuration manifest strand data - this can be expressed in the same formats as ``configuration_values``
    :param str|dict|None children: the children strand data - this can be expressed in the same formats as ``configuration_values``
    :param str|None output_location: the path to a cloud directory to save output datasets at
    :param str|None project_name: name of the Google Cloud project to get credentials from
    :param str|None service_id: the ID of the service being run
    :return None:
    """

    def __init__(
        self,
        app_src,
        twine="twine.json",
        configuration_values=None,
        configuration_manifest=None,
        children=None,
        output_location=None,
        project_name=None,
        service_id=None,
    ):
        self.app_source = app_src
        self.children = children

        if output_location and not re.match(r"^gs://[a-z\d][a-z\d_./-]*$", output_location):
            raise exceptions.InvalidInputException(
                "The output location must be a Google Cloud Storage path e.g. 'gs://bucket-name/output_directory'."
            )

        self.output_location = output_location

        # Ensure the twine is present and instantiate it.
        if isinstance(twine, Twine):
            self.twine = twine
        else:
            self.twine = Twine(source=twine)

        logger.debug("Parsed twine with strands %r", self.twine.available_strands)

        # Validate and initialise configuration data.
        self.configuration = self.twine.validate(
            configuration_values=configuration_values,
            configuration_manifest=configuration_manifest,
            cls=CLASS_MAP,
        )
        logger.debug("Configuration validated.")

        self.service_id = service_id
        self._project_name = project_name

    def run(
        self,
        analysis_id=None,
        input_values=None,
        input_manifest=None,
        analysis_log_level=logging.INFO,
        analysis_log_handler=None,
        handle_monitor_message=None,
    ):
        """Run an analysis.

        :param str|None analysis_id: UUID of the analysis
        :param str|dict|None input_values: the input values strand data - this can be expressed as a string path to a ``*.json`` file (relative or absolute), an open file-like object containing JSON data, a string of JSON data, or an already-parsed dict
        :param str|dict|octue.resources.manifest.Manifest|None input_manifest: the input manifest strand data - this can be expressed in the same formats as ``input_values``
        :param int|str analysis_log_level: the level below which to ignore log messages
        :param logging.Handler|None analysis_log_handler: the ``logging.Handler`` instance used to handle logs for this analysis run - handlers can be created as per the logging cookbook (https://docs.python.org/3/howto/logging-cookbook.html) but should use the format defined above in LOG_FORMAT
        :param callable|None handle_monitor_message: a function that sends monitor messages to the parent that requested the analysis
        :return octue.resources.analysis.Analysis:
        """
        if hasattr(self.twine, "credentials"):
            self._populate_environment_with_google_cloud_secrets()
            credentials = self.twine.credentials
        else:
            credentials = None

        inputs = self.twine.validate(
            input_values=input_values,
            input_manifest=input_manifest,
            credentials=credentials,
            children=self.children,
            cls=CLASS_MAP,
            allow_missing=False,
            allow_extra=False,
        )
        logger.debug("Inputs validated.")

        for manifest_strand in self.twine.available_manifest_strands:
            if manifest_strand == "output_manifest":
                continue

            self._validate_dataset_file_tags(manifest_kind=manifest_strand, manifest=inputs[manifest_strand])

        if inputs["children"] is not None:
            inputs["children"] = {
                child["key"]: Child(
                    id=child["id"],
                    backend=child["backend"],
                    internal_service_name=self.service_id,
                )
                for child in inputs["children"]
            }

        outputs_and_monitors = self.twine.prepare("monitor_message", "output_values", "output_manifest", cls=CLASS_MAP)

        analysis_id = str(analysis_id) if analysis_id else gen_uuid()

        if analysis_log_handler:
            extra_log_handlers = [analysis_log_handler]
        else:
            extra_log_handlers = None

        # Temporarily replace the root logger's handlers with a `StreamHandler` and the analysis log handler that
        # include the analysis ID in the logging metadata.
        with AnalysisLogHandlerSwitcher(
            analysis_id=analysis_id,
            logger=logging.getLogger(),
            analysis_log_level=analysis_log_level,
            extra_log_handlers=extra_log_handlers,
        ):
            analysis = Analysis(
                id=analysis_id,
                twine=self.twine,
                handle_monitor_message=handle_monitor_message,
                output_location=self.output_location,
                **self.configuration,
                **inputs,
                **outputs_and_monitors,
            )

            try:
                # App as a class that takes "analysis" as a constructor argument and contains a method named "run"
                # that takes no arguments.
                if isinstance(self.app_source, type):
                    self.app_source(analysis).run()

                # App as a module containing a function named "run" that takes "analysis" as an argument.
                elif hasattr(self.app_source, "run"):
                    self.app_source.run(analysis)

                # App as a string path to a module containing a class named "App" or a function named "run", with the
                # same specifications as above.
                elif isinstance(self.app_source, str):
                    with AppFrom(self.app_source) as app:
                        if hasattr(app.app_module, "App"):
                            app.app_module.App(analysis).run()
                        else:
                            app.run(analysis)

                # App as a function that takes "analysis" as an argument.
                else:
                    self.app_source(analysis)

            except ModuleNotFoundError as e:
                raise ModuleNotFoundError(f"{e.msg} in {os.path.abspath(self.app_source)!r}.")

            except Exception as e:
                logger.error(str(e))
                raise e

            if not analysis.finalised:
                analysis.finalise()

            return analysis

    def _populate_environment_with_google_cloud_secrets(self):
        """Get any secrets specified in the credentials strand from Google Cloud Secret Manager and put them in the
        local environment, ready for use by the runner.

        :return None:
        """
        missing_credentials = tuple(
            credential for credential in self.twine.credentials if credential["name"] not in os.environ
        )

        if not missing_credentials:
            return

        google_cloud_credentials, project_name = auth.default()
        secrets_client = secretmanager.SecretManagerServiceClient(credentials=google_cloud_credentials)

        if google_cloud_credentials is None:
            project_name = self._project_name

        for credential in missing_credentials:
            secret_path = secrets_client.secret_version_path(
                project=project_name, secret=credential["name"], secret_version="latest"
            )

            try:
                secret = secrets_client.access_secret_version(name=secret_path).payload.data.decode("UTF-8")
            except google.api_core.exceptions.NotFound:
                # No need to raise an error here as the Twine validation that follows will do so.
                continue

            os.environ[credential["name"]] = secret

    def _validate_dataset_file_tags(self, manifest_kind, manifest):
        """Validate the tags of the files of each dataset in the manifest against the file tags template in the
        corresponding dataset field in the given manifest field of the twine.

        :param str manifest_kind: the kind of manifest that's being validated (so the correct schema can be accessed)
        :param octue.resources.manifest.Manifest manifest: the manifest whose datasets' files are to be validated
        :return None:
        """
        # This is the manifest schema included in the twine.json file, not the schema for `manifest.json` files.
        manifest_schema = getattr(self.twine, manifest_kind)

        for dataset_name, dataset_schema in manifest_schema["datasets"].items():
            dataset = manifest.datasets.get(dataset_name)
            file_tags_template = dataset_schema.get("file_tags_template")

            # Allow optional datasets in future (not currently allowed by `twined`).
            if not (dataset and file_tags_template):
                continue

            for file in dataset.files:
                try:
                    jsonschema_validate(instance=dict(file.tags), schema=file_tags_template)
                except ValidationError as e:
                    message = (
                        e.message
                        + f" for files in the {dataset_name!r} dataset. The affected datafile is "
                        f"{file.path!r}. Add the property to the datafile as a tag to fix this."
                    )

                    raise twined.exceptions.invalid_contents_map[manifest_kind](message)
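
# A minimal usage sketch (illustrative, not part of the library): running `example_app`
# (defined after the `Analysis` class above) with a `Runner`. The "twine.json" path and
# the input values are hypothetical - they assume a twine in the working directory whose
# input values schema defines "height" and "width" and whose output values schema
# defines "area", with no configuration strands required.
if __name__ == "__main__":
    runner = Runner(app_src=example_app, twine="twine.json")
    analysis = runner.run(input_values={"height": 3, "width": 4})
    print(analysis.output_values)  # {"area": 12}, given the assumed twine.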