def find_file_categories(self):
        """Group the file paths of the dataset by file category.

        The file category of a file is its use category. It is looked
        up from the dataset metadata for the file itself, and if that
        fails, from the use categories of the dataset directories using
        the parent directory path of the file.

        :returns: dictionary that maps each file category to the list
                  of file paths that belong to it
        :raises InvalidDatasetMetadataError: if file category is found
                                             neither for the file nor
                                             for its parent directory
        """
        configuration = Configuration(self.config)
        client = Metax(
            configuration.get('metax_url'),
            configuration.get('metax_user'),
            configuration.get('metax_password'),
            verify=configuration.getboolean('metax_ssl_verification'))
        files = client.get_dataset_files(self.dataset_id)
        metadata = client.get_dataset(self.dataset_id)
        languages = get_dataset_languages(metadata)
        directory_categories = get_dirpath_dict(client, metadata)
        structure_map = {}

        for file_metadata in files:
            file_id = file_metadata['identifier']

            # Primary source of the category is the file specific
            # metadata in the dataset.
            category = find_file_use_category(file_id, metadata)

            if category is None:
                # The file is not listed in the dataset metadata, so
                # fall back to the directories. The parent directory
                # path is the file path with the file name cut off from
                # the end.
                filename_length = len(file_metadata["file_name"])
                parent_path = file_metadata["file_path"][:-filename_length]
                category = find_dir_use_category(
                    parent_path, directory_categories, languages)

            if category is None:
                # Neither the file nor its parent directory has a
                # category
                raise InvalidDatasetMetadataError(
                    "File category for file {} was not found".format(file_id))

            # Start a new list for categories seen for the first time
            structure_map.setdefault(category, []).append(
                file_metadata['file_path'])

        return structure_map
    def run(self):
        """Read list of required files from Metax and download them.

        Files are written to path based on ``file_path`` in Metax.
        Everything is first downloaded to a temporary directory, which
        is moved to the output target path when all files have been
        downloaded.

        :raises InvalidFileMetadataError: if the ``file_path`` of a file
            does not resolve to a path inside the download directory
        :returns: ``None``
        """
        upload_database = upload_rest_api.database.Database()

        # Find file identifiers from Metax dataset metadata.
        config_object = Configuration(self.config)
        metax_client = Metax(
            config_object.get('metax_url'),
            config_object.get('metax_user'),
            config_object.get('metax_password'),
            verify=config_object.getboolean('metax_ssl_verification'))
        dataset_files = metax_client.get_dataset_files(self.dataset_id)

        # Download files to temporary directory which will be moved to
        # output target path when all files have been downloaded
        with self.output().temporary_path() as temporary_directory:
            os.mkdir(temporary_directory)

            # Normalized download root that always ends with a path
            # separator. Comparing against the bare directory name
            # would also accept sibling paths that merely share the
            # prefix, for example "<temporary_directory>-other/file".
            download_root = os.path.join(
                os.path.normpath(temporary_directory), '')

            for dataset_file in dataset_files:
                identifier = dataset_file["identifier"]

                # Full path to file
                target_path = os.path.normpath(
                    os.path.join(temporary_directory,
                                 dataset_file["file_path"].strip('/')))

                # The file must be strictly inside the download root:
                # paths that escape it with ".." components, or that
                # resolve to the root directory itself, are rejected.
                if not target_path.startswith(download_root):
                    raise InvalidFileMetadataError(
                        'The file path of file %s is invalid: %s' %
                        (identifier, dataset_file["file_path"]))

                # Create the download directory for file if it does not
                # exist already
                if not os.path.isdir(os.path.dirname(target_path)):
                    # TODO: Use exist_ok -parameter when moving to
                    # python3
                    os.makedirs(os.path.dirname(target_path))

                download_file(dataset_file, target_path, self.config,
                              upload_database)
    def get_identifiers(self):
        """Get file identifiers.

        List the identifiers of all dataset files together with the path
        to the file cache, i.e. the ``file_cache`` directory in the
        packaging root. The list of identifiers is empty if reading the
        dataset files raises ``DatasetNotAvailableError``.

        :returns: Tuple (list of identifiers, cache_path)
        """
        configuration = Configuration(self.config)
        file_cache = os.path.join(configuration.get("packaging_root"),
                                  "file_cache")

        client = Metax(
            configuration.get('metax_url'),
            configuration.get('metax_user'),
            configuration.get('metax_password'),
            verify=configuration.getboolean('metax_ssl_verification'))

        # Both the request and the iteration of its result are guarded,
        # so an unavailable dataset always yields an empty list.
        try:
            identifiers = [
                entry["identifier"]
                for entry in client.get_dataset_files(self.dataset_id)
            ]
        except DatasetNotAvailableError:
            identifiers = []

        return identifiers, file_cache
Exemplo n.º 4
0
class CreateTechnicalMetadata(WorkflowTask):
    """Create METS documents that contain technical metadata.

    PREMIS object metadata is created for every dataset file and
    written to
    `<sip_creation_path>/<url_encoded_filepath>-PREMIS%3AOBJECT-amd.xml`.
    File properties are written to
    `<sip_creation_path>/<url_encoded_filepath>-scraper.json`.
    PREMIS event metadata and PREMIS agent metadata are written to
    `<sip_creation_path>/<premis_event_id>-PREMIS%3AEVENT-amd.xml` and
    `<sip_creation_path>/<premis_agent_id>-PREMIS%3AEVENT-amd.xml`.
    Import object PREMIS event metadata references are written to
    `<sip_creation_path>/import-object-md-references.jsonl`.

    File format specific metadata is copied from Metax when it is
    available. It is written to
    `<sip_creation_path>/<url_encoded_filepath>-<metadata_type>-amd.xml`,
    where <metadata_type> is NISOIMG, ADDML, AudioMD, or VideoMD.
    References to file format specific metadata are written to a
    json-file that depends on the file format. For example, references
    to NISOIMG metadata are written to
    `<sip_creation_path>/create-mix-md-references`.

    The list of PREMIS event references is written to
    `<workspace>/create-technical-metadata.jsonl`

    The task requires workspace to be created, dataset metadata to be
    validated and dataset files to be downloaded.
    """

    success_message = 'Technical metadata for objects created'
    failure_message = 'Technical metadata for objects could not be created'

    def __init__(self, *args, **kwargs):
        """Initialize the task, its configuration and Metax client."""
        super(CreateTechnicalMetadata, self).__init__(*args, **kwargs)
        configuration = Configuration(self.config)
        self.config_object = configuration
        self.metax_client = Metax(
            configuration.get('metax_url'),
            configuration.get('metax_user'),
            configuration.get('metax_password'),
            verify=configuration.getboolean('metax_ssl_verification'))

    def requires(self):
        """List the Tasks that this Task depends on.

        :returns: dictionary of required tasks
        """
        # Every required task is parametrized exactly like this task
        task_parameters = {
            'workspace': self.workspace,
            'dataset_id': self.dataset_id,
            'config': self.config,
        }
        return {
            'workspace': CreateWorkspace(**task_parameters),
            'validation': ValidateMetadata(**task_parameters),
            'files': GetFiles(**task_parameters),
        }

    def output(self):
        """Return output target of this Task.

        :returns: `<workspace>/create-technical-metadata.jsonl`
        :rtype: LocalTarget
        """
        target_path = os.path.join(self.workspace,
                                   'create-technical-metadata.jsonl')
        return LocalTarget(target_path)

    def run(self):
        """Create technical metadata.

        Creates PREMIS technical metadata files and technical attribute
        files for each file of the dataset. The files are created in a
        temporary directory and moved to the SIP creation directory
        when all of them are ready.

        :returns: ``None``
        """
        dataset_files = self.metax_client.get_dataset_files(self.dataset_id)

        # All import_object events share one timestamp, so that a new
        # event is not created for each imported file
        timestamp = datetime.datetime.utcnow().isoformat()

        tmp_prefix = os.path.join(self.config_object.get('packaging_root'),
                                  'tmp/')
        with TemporaryDirectory(prefix=tmp_prefix) as staging_dir:
            for file_metadata in dataset_files:
                sip_path = os.path.join(
                    'dataset_files', file_metadata['file_path'].strip('/'))

                # METS document that contains PREMIS metadata
                self.create_objects(file_metadata, sip_path, timestamp,
                                    staging_dir)

                # METS documents that contain technical attributes
                self.create_technical_attributes(file_metadata, sip_path,
                                                 staging_dir)

            # The PREMIS event reference file becomes the output of the
            # task. It is taken out of the staging directory first, and
            # the rest of the created files are then moved to the SIP
            # creation directory.
            event_references = os.path.join(
                staging_dir, 'premis-event-md-references.jsonl')
            with self.output().temporary_path() as target_path:
                shutil.move(event_references, target_path)
                for name in os.listdir(staging_dir):
                    shutil.move(os.path.join(staging_dir, name),
                                self.sip_creation_path)

    def create_objects(self, metadata, filepath, event_datetime, output):
        """Create PREMIS metadata for file.

        The file format, checksum and the optional character set,
        format version and creation date are read from the file
        metadata dictionary. The PREMIS metadata itself is generated by
        siptools import_object script.

        :param metadata: file metadata dictionary
        :param filepath: file path in SIP
        :param event_datetime: the timestamp for the import_object
                               events
        :param output: output directory for import_object script
        :returns: ``None``
        """
        def optional_characteristic(key, default):
            """Read file characteristic or use default if missing."""
            try:
                return metadata["file_characteristics"][key]
            except KeyError:
                return default

        # Character set and format version are not defined for all
        # files
        charset = optional_characteristic("encoding", None)
        formatversion = optional_characteristic("format_version", "")

        # These algorithm names are not final: the actual name is
        # resolved with the help of the checksum value
        digest_algorithm = metadata["checksum"]["algorithm"]
        if digest_algorithm in ("md5", "sha2"):
            digest_algorithm = algorithm_name(digest_algorithm,
                                              metadata["checksum"]["value"])

        # File creation date is not defined for all files either
        date_created = optional_characteristic("file_created", None)

        file_format = (metadata["file_characteristics"]["file_format"],
                       formatversion)
        checksum = (digest_algorithm, metadata["checksum"]["value"])

        # Create PREMIS file metadata XML
        siptools.scripts.import_object.import_object(
            filepaths=[filepath],
            base_path=self.workspace,
            workspace=output,
            skip_wellformed_check=True,
            file_format=file_format,
            checksum=checksum,
            charset=charset,
            date_created=date_created,
            event_datetime=event_datetime,
            event_target='.')

    def create_technical_attributes(self, metadata, filepath, output):
        """Create technical metadata for a file.

        Writes one METS TechMD file for each metadata element that is
        generated for the file, and adds a reference from the file to
        each of them.

        :param metadata: file metadata dictionary
        :param filepath: path of file in SIP
        :param output: Path to the temporary workspace
        :returns: ``None``
        """
        creator = siptools.mdcreator.MetsSectionCreator(output)

        # The metadata is generated from the file in the input
        # directory of the 'files' task
        source_path = os.path.join(self.input()['files'].path,
                                   metadata['file_path'].strip('/'))
        generator = XMLMetadataGenerator(file_path=source_path,
                                         file_metadata=metadata)

        for md_elem in generator.create():
            # The namespace of the element selects the metadata type
            namespace = md_elem.nsmap[md_elem.prefix]
            attr_type = TECH_ATTR_TYPES[namespace]

            mdtype = attr_type["mdtype"]
            mdtypeversion = attr_type["mdtypeversion"]
            othermdtype = attr_type.get("othermdtype")
            ref_file = attr_type["ref_file"]

            # Create METS TechMD file
            techmd_id, _ = creator.write_md(metadata=md_elem,
                                            mdtype=mdtype,
                                            mdtypeversion=mdtypeversion,
                                            othermdtype=othermdtype)

            # Add reference from fileSec to TechMD
            creator.add_reference(techmd_id, filepath)
            creator.write(ref_file=ref_file)