Exemplo n.º 1
0
    def _import_artifacts(self, source_uri: List[Text], reimport: bool,
                          destination_channel: types.Channel,
                          split_names: List[Text]) -> List[types.Artifact]:
        """Imports external resources into MLMD, one artifact per source uri.

        For each (uri, split name) pair, artifacts already registered under
        the same uri are looked up. If any of them qualifies and `reimport`
        is False, the one with the highest id is reused; otherwise a new
        artifact is published through the metadata handler.

        Args:
          source_uri: Uris of the external resources to import.
          reimport: If True, always publish a new artifact even when a
            matching one is already registered.
          destination_channel: Channel whose `type` is instantiated to build
            each output artifact.
          split_names: Split name for each uri, paired positionally with
            `source_uri`. A falsy entry is treated as "no split". Because the
            two lists are combined with `zip`, surplus entries in the longer
            list are ignored.

        Returns:
          The imported (reused or newly registered) artifacts, in input order.
        """
        results = []
        # NOTE: the loop variable is deliberately not called `split_names`, so
        # the parameter is never rebound while it is being iterated.
        for uri, split in zip(source_uri, split_names):
            absl.logging.info('Processing source uri: %s, split: %s', uri,
                              split or 'NO_SPLIT')

            result = destination_channel.type()
            desired_split_names = artifact_utils.encode_split_names(
                [split or ''])

            # TODO(ccy): refactor importer to treat split name just like any other
            # property.
            unfiltered_previous_artifacts = (
                self._metadata_handler.get_artifacts_by_uri(uri))

            # Whether the output type declares a split property does not depend
            # on the previous artifact, so decide once per uri.
            filter_by_split = bool(
                result.PROPERTIES and SPLIT_KEY in result.PROPERTIES)

            previous_artifacts = []
            for previous_artifact in unfiltered_previous_artifacts:
                if not filter_by_split:
                    # Unconditionally add the previous artifact for consideration.
                    previous_artifacts.append(previous_artifact)
                    continue
                # Consider the previous artifact only if the split names match.
                previous_split = previous_artifact.properties.get(
                    'split_names', None)
                if (previous_split and
                        previous_split.string_value == desired_split_names):
                    previous_artifacts.append(previous_artifact)

            # TODO(ccy): refactor importer to treat split name just like any other
            # property.
            if SPLIT_KEY in result.artifact_type.properties:
                result.split_names = desired_split_names
            result.uri = uri

            # Reuse the most recent (highest id) qualifying artifact unless the
            # user asked for a re-import; otherwise register the external
            # resource in MLMD using the type info of the destination channel.
            if previous_artifacts and not reimport:
                absl.logging.info('Reusing existing artifact')
                result.set_mlmd_artifact(
                    max(previous_artifacts, key=lambda m: m.id))
            else:
                [registered_artifact
                 ] = self._metadata_handler.publish_artifacts([result])
                absl.logging.info('Registered new artifact: %s',
                                  registered_artifact)
                result.set_mlmd_artifact(registered_artifact)

            results.append(result)

        return results
Exemplo n.º 2
0
    def _prepare_artifact(
            self, uri: Text, properties: Dict[Text, Any],
            custom_properties: Dict[Text, Any], reimport: bool,
            destination_channel: types.Channel) -> types.Artifact:
        """Prepares the Importer's output artifact.

        If there is already an artifact in MLMD with the same URI and
        properties / custom properties, that artifact will be reused unless
        the `reimport` argument is set to True. When several artifacts
        qualify, the one with the highest id is reused.

        Args:
          uri: The uri of the artifact.
          properties: The properties of the artifact, given as a dictionary
            from string keys to integer / string values. Must conform to the
            declared properties of the destination channel's output type;
            each key is read and written as an attribute of the artifact.
          custom_properties: The custom properties of the artifact, given as a
            dictionary from string keys to integer / string (or bytes) values.
          reimport: If set to True, the returned artifact is not bound to any
            previously registered artifact, even if a matching one exists.
          destination_channel: Destination channel for the imported artifact.

        Returns:
          An Artifact object representing the imported artifact.

        Raises:
          ValueError: If a custom property value is not an int, string or
            bytes.
        """
        absl.logging.info(
            'Processing source uri: %s, properties: %s, custom_properties: %s',
            uri, properties, custom_properties)

        # Check types of custom properties before touching the metadata store.
        for key, value in custom_properties.items():
            if not isinstance(value, (int, Text, bytes)):
                raise ValueError((
                    'Custom property value for key %r must be a string or integer '
                    '(got %r instead)') % (key, value))

        def _matches_request(candidate_mlmd_artifact) -> bool:
            """Returns True if the stored artifact has the requested properties."""
            candidate_artifact = destination_channel.type()
            candidate_artifact.set_mlmd_artifact(candidate_mlmd_artifact)
            for key, value in properties.items():
                if getattr(candidate_artifact, key) != value:
                    return False
            for key, value in custom_properties.items():
                if isinstance(value, int):
                    stored = candidate_artifact.get_int_custom_property(key)
                else:
                    # Validated above: anything that is not an int is a
                    # string or bytes value.
                    stored = candidate_artifact.get_string_custom_property(key)
                if stored != value:
                    return False
            return True

        # Only consider previous artifacts as candidates to reuse, if the properties
        # of the imported artifact match those of the existing artifact.
        unfiltered_previous_artifacts = (
            self._metadata_handler.get_artifacts_by_uri(uri))
        previous_artifacts = [
            candidate for candidate in unfiltered_previous_artifacts
            if _matches_request(candidate)
        ]

        result = destination_channel.type()
        result.uri = uri
        for key, value in properties.items():
            setattr(result, key, value)
        for key, value in custom_properties.items():
            if isinstance(value, int):
                result.set_int_custom_property(key, value)
            else:
                result.set_string_custom_property(key, value)

        # If a registered artifact has the same uri and properties and the user does
        # not explicitly ask for reimport, reuse that artifact.
        if previous_artifacts and not reimport:
            absl.logging.info('Reusing existing artifact')
            result.set_mlmd_artifact(
                max(previous_artifacts, key=lambda m: m.id))

        return result