def find_file_categories(self):
    """Create logical structure map of dataset files.

    Returns dictionary with filecategories as keys and filepaths as
    values.

    :returns: logical structure map dictionary
    """
    config_object = Configuration(self.config)
    metax_client = Metax(
        config_object.get('metax_url'),
        config_object.get('metax_user'),
        config_object.get('metax_password'),
        verify=config_object.getboolean('metax_ssl_verification'))
    dataset_files = metax_client.get_dataset_files(self.dataset_id)
    dataset_metadata = metax_client.get_dataset(self.dataset_id)
    languages = get_dataset_languages(dataset_metadata)
    dirpath2usecategory = get_dirpath_dict(metax_client, dataset_metadata)
    logical_struct = {}

    for dataset_file in dataset_files:
        file_id = dataset_file['identifier']

        # Get the use category of file. The path to the file in
        # logical structmap is stored in 'use_category' in metax.
        filecategory = find_file_use_category(file_id, dataset_metadata)

        # If file listed in datasets/<id>/files is not listed in
        # 'files' section of dataset metadata, look for
        # parent_directory of the file from 'directories' section.
        # The "use_category" of file is the "use_category" of the
        # parent directory.
        if filecategory is None:
            name_len = len(dataset_file["file_name"])
            filecategory = find_dir_use_category(
                dataset_file["file_path"][:-name_len],
                dirpath2usecategory, languages)

        # If file category was not found even for the parent
        # directory, raise error
        if filecategory is None:
            raise InvalidDatasetMetadataError(
                "File category for file {} was not found".format(file_id))

        # Append path to logical_struct[filecategory] list, creating
        # the list on first use.
        logical_struct.setdefault(filecategory, []).append(
            dataset_file['file_path'])

    return logical_struct
def run(self):
    """Read list of required files from Metax and download them.

    Files are written to path based on ``file_path`` in Metax.

    :raises InvalidFileMetadataError: if a file path would resolve
        outside the download directory
    :returns: ``None``
    """
    upload_database = upload_rest_api.database.Database()

    # Find file identifiers from Metax dataset metadata.
    config_object = Configuration(self.config)
    metax_client = Metax(
        config_object.get('metax_url'),
        config_object.get('metax_user'),
        config_object.get('metax_password'),
        verify=config_object.getboolean('metax_ssl_verification'))
    dataset_files = metax_client.get_dataset_files(self.dataset_id)

    # Download files to temporary directory which will be moved to
    # output target path when all files have been downloaded
    with self.output().temporary_path() as temporary_directory:
        os.mkdir(temporary_directory)
        for dataset_file in dataset_files:
            identifier = dataset_file["identifier"]

            # Full path to file
            target_path = os.path.normpath(
                os.path.join(temporary_directory,
                             dataset_file["file_path"].strip('/')))
            # Reject paths that escape the download directory (e.g.
            # via ".." components). Compare against the directory
            # path with a trailing separator: a plain prefix check
            # would also accept sibling paths such as
            # "<temporary_directory>evil" that merely share the
            # directory name as a string prefix.
            if not target_path.startswith(
                    os.path.join(temporary_directory, '')):
                raise InvalidFileMetadataError(
                    'The file path of file %s is invalid: %s'
                    % (identifier, dataset_file["file_path"]))

            # Create the download directory for file if it does not
            # exist already
            if not os.path.isdir(os.path.dirname(target_path)):
                # TODO: Use exist_ok -parameter when moving to
                # python3
                os.makedirs(os.path.dirname(target_path))

            download_file(dataset_file, target_path, self.config,
                          upload_database)
def get_identifiers(self):
    """Get file identifiers.

    Return a list of all the file identifiers and the path to the
    downloaded files.

    :returns: Tuple (list of identifiers, cache_path)
    """
    config_object = Configuration(self.config)
    cache_path = os.path.join(config_object.get("packaging_root"),
                              "file_cache")
    metax_client = Metax(
        config_object.get('metax_url'),
        config_object.get('metax_user'),
        config_object.get('metax_password'),
        verify=config_object.getboolean('metax_ssl_verification'))

    # The dataset may have disappeared from Metax; report an empty
    # identifier list in that case instead of failing.
    try:
        dataset_files = metax_client.get_dataset_files(self.dataset_id)
    except DatasetNotAvailableError:
        return [], cache_path

    identifiers = [entry["identifier"] for entry in dataset_files]
    return identifiers, cache_path
class CreateTechnicalMetadata(WorkflowTask):
    """Create METS documents that contain technical metadata.

    The PREMIS object metadata is created to all dataset files and it
    is written to
    `<sip_creation_path>/<url_encoded_filepath>-PREMIS%3AOBJECT-amd.xml`.
    File properties are written to
    `<sip_creation_path>/<url_encoded_filepath>-scraper.json`.
    PREMIS event metadata and PREMIS agent metadata are written to
    `<sip_creation_path>/<premis_event_id>-PREMIS%3AEVENT-amd.xml` and
    `<sip_creation_path>/<premis_agent_id>-PREMIS%3AEVENT-amd.xml`.
    Import object PREMIS event metadata references are written to
    `<sip_creation_path>/import-object-md-references.jsonl`.

    The file format specific metadata is copied from metax if it is
    available. It is written to
    `<sip_creation_path>/<url_encoded_filepath>-<metadata_type>-amd.xml`,
    where <metadata_type> is NISOIMG, ADDML, AudioMD, or VideoMD. File
    format specific metadata references are written to a json-file
    depending on file format. For example, references to NISOIMG
    metadata are written to
    `<sip_creation_path>/create-mix-md-references`.

    List of PREMIS event references is written to
    `<workspace>/create-technical-metadata.jsonl`

    The task requires workspace to be created, dataset metadata to be
    validated and dataset files to be downloaded.
    """

    success_message = 'Technical metadata for objects created'
    failure_message = 'Technical metadata for objects could not be created'

    def __init__(self, *args, **kwargs):
        """Initialize Task.

        Creates the Metax client once so that every method of the task
        shares the same configured connection.
        """
        super(CreateTechnicalMetadata, self).__init__(*args, **kwargs)
        self.config_object = Configuration(self.config)
        self.metax_client = Metax(
            self.config_object.get('metax_url'),
            self.config_object.get('metax_user'),
            self.config_object.get('metax_password'),
            verify=self.config_object.getboolean('metax_ssl_verification'))

    def requires(self):
        """List the Tasks that this Task depends on.

        :returns: dictionary of required tasks
        """
        return {
            'workspace': CreateWorkspace(workspace=self.workspace,
                                         dataset_id=self.dataset_id,
                                         config=self.config),
            'validation': ValidateMetadata(workspace=self.workspace,
                                           dataset_id=self.dataset_id,
                                           config=self.config),
            'files': GetFiles(workspace=self.workspace,
                              dataset_id=self.dataset_id,
                              config=self.config)
        }

    def output(self):
        """Return output target of this Task.

        :returns: `<workspace>/create-technical-metadata.jsonl`
        :rtype: LocalTarget
        """
        return LocalTarget(
            os.path.join(self.workspace, 'create-technical-metadata.jsonl'))

    def run(self):
        """Create technical metadata.

        Creates PREMIS technical metadata files and technical
        attribute files for every dataset file, then moves the results
        to the SIP creation directory.

        :returns: ``None``
        """
        files = self.metax_client.get_dataset_files(self.dataset_id)

        # Create one timestamp for import_object events to avoid
        # creating new events each time import_object is iterated
        event_datetime = datetime.datetime.utcnow().isoformat()

        tmp = os.path.join(self.config_object.get('packaging_root'), 'tmp/')
        with TemporaryDirectory(prefix=tmp) as temporary_workspace:
            for file_ in files:
                filepath = os.path.join('dataset_files',
                                        file_['file_path'].strip('/'))

                # Create METS document that contains PREMIS metadata
                self.create_objects(file_, filepath, event_datetime,
                                    temporary_workspace)

                # Create METS documents that contain technical
                # attributes
                self.create_technical_attributes(file_, filepath,
                                                 temporary_workspace)

            # Move created files to sip creation directory. PREMIS event
            # reference file is moved to output target path after
            # everything else is done (temporary_path renames it to the
            # final output location only when this block exits cleanly).
            with self.output().temporary_path() as target_path:
                shutil.move(
                    os.path.join(temporary_workspace,
                                 'premis-event-md-references.jsonl'),
                    target_path)
                for file_ in os.listdir(temporary_workspace):
                    shutil.move(os.path.join(temporary_workspace, file_),
                                self.sip_creation_path)

    def create_objects(self, metadata, filepath, event_datetime, output):
        """Create PREMIS metadata for file.

        Reads file metadata from Metax. Technical metadata is generated
        by siptools import_object script.

        :param metadata: file metadata dictionary
        :param filepath: file path in SIP
        :param event_datetime: the timestamp for the import_object
            events
        :param output: output directory for import_object script
        :returns: ``None``
        """
        # Read character set if it defined for this file
        try:
            charset = metadata["file_characteristics"]["encoding"]
        except KeyError:
            charset = None

        # Read format version if it is defined for this file
        try:
            formatversion = metadata["file_characteristics"]["format_version"]
        except KeyError:
            formatversion = ""

        digest_algorithm = metadata["checksum"]["algorithm"]

        # figure out the checksum algorithm: Metax may report a generic
        # name ("md5", "sha2") that is resolved to the exact algorithm
        # based on the checksum value
        if digest_algorithm in ["md5", "sha2"]:
            digest_algorithm = algorithm_name(digest_algorithm,
                                              metadata["checksum"]["value"])

        # Read file creation date if it is defined for this file
        try:
            date_created = metadata["file_characteristics"]["file_created"]
        except KeyError:
            date_created = None

        # Create PREMIS file metadata XML
        siptools.scripts.import_object.import_object(
            filepaths=[filepath],
            base_path=self.workspace,
            workspace=output,
            skip_wellformed_check=True,
            file_format=(metadata["file_characteristics"]["file_format"],
                         formatversion),
            checksum=(digest_algorithm, metadata["checksum"]["value"]),
            charset=charset,
            date_created=date_created,
            event_datetime=event_datetime,
            event_target='.')

    def create_technical_attributes(self, metadata, filepath, output):
        """Create technical metadata for a file.

        Create METS TechMD files for each metadata type based on
        previously scraped file characteristics.

        :param metadata: file metadata dictionary
        :param filepath: path of file in SIP
        :param output: Path to the temporary workspace
        :returns: ``None``
        """
        creator = siptools.mdcreator.MetsSectionCreator(output)
        metadata_generator = XMLMetadataGenerator(
            file_path=os.path.join(self.input()['files'].path,
                                   metadata['file_path'].strip('/')),
            file_metadata=metadata)

        md_elems = metadata_generator.create()
        for md_elem in md_elems:
            # Retrieve the wrapped MD document: the element's namespace
            # selects the METS metadata type and reference file
            md_namespace = md_elem.nsmap[md_elem.prefix]
            mdtype = TECH_ATTR_TYPES[md_namespace]["mdtype"]
            mdtypeversion = TECH_ATTR_TYPES[md_namespace]["mdtypeversion"]
            othermdtype = TECH_ATTR_TYPES[md_namespace].get(
                "othermdtype", None)
            ref_file = TECH_ATTR_TYPES[md_namespace]["ref_file"]

            # Create METS TechMD file
            techmd_id, _ = creator.write_md(metadata=md_elem,
                                            mdtype=mdtype,
                                            mdtypeversion=mdtypeversion,
                                            othermdtype=othermdtype)

            # Add reference from fileSec to TechMD
            creator.add_reference(techmd_id, filepath)
            creator.write(ref_file=ref_file)