def get_aip_info(aic_dir):
    """Collect the UUID, name and label of every AIP listed in an AIC.

    ``<aic_dir>/objects`` is scanned for regular files whose filename is a
    UUID. The first line of each such file is used as the AIP name and the
    file is deleted afterwards. For every AIP found, its METS file is
    extracted through ``storage_service.extract_file`` into the objects
    directory, the Dublin Core title is read from it as the label, and the
    METS copy is removed again.

    :param aic_dir: Path of the AIC directory that contains ``objects``.
    :returns: List of dicts with the keys ``name``, ``uuid`` and ``label``
        (``label`` is an empty string when no title is found).
    """
    aips = []
    aic_dir = os.path.join(aic_dir, 'objects')

    # Parse out AIP names and UUIDs.
    # The only contents of the folder should be a bunch of files whose
    # filenames are AIP UUIDs, and the contents are the AIP name.
    uuid_regex = r'^[0-9A-Fa-f]{8}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{12}$'
    files = [
        d for d in os.listdir(aic_dir)
        if os.path.isfile(os.path.join(aic_dir, d)) and re.match(uuid_regex, d)
    ]
    for filename in files:
        file_path = os.path.join(aic_dir, filename)
        with open(file_path, 'r') as f:
            aip_name = f.readline()
        os.remove(file_path)
        aips.append({'name': aip_name, 'uuid': filename})

    # Fetch the METS file and parse out the Dublin Core metadata with the label.
    for aip in aips:
        mets_in_aip = "{aip_name}-{aip_uuid}/data/METS.{aip_uuid}.xml".format(
            aip_name=aip['name'], aip_uuid=aip['uuid'])
        mets_path = os.path.join(aic_dir, "METS.{}.xml".format(aip['uuid']))
        try:
            storage_service.extract_file(aip['uuid'], mets_in_aip, mets_path)
            root = etree.parse(mets_path)
            # Title may be namespaced as dc: or dcterms: depending on version.
            aip['label'] = (
                root.findtext(
                    'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore/dc:title',
                    namespaces=ns.NSMAP) or
                root.findtext(
                    'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore/dcterms:title',
                    namespaces=ns.NSMAP) or
                '')
        finally:
            # Never leave the downloaded METS copy behind in the objects
            # directory, even when extraction or parsing fails. The guard
            # avoids masking the original error when nothing was downloaded.
            if os.path.isfile(mets_path):
                os.remove(mets_path)

    print('AIP info:', aips)
    return aips
def get_aips_in_aic(mets_root, temp_dir, uuid):
    """Look up how many AIPs an AIC contains, using its AIC METS file.

    The AIC METS filename and the AIP directory name are read from the AIP
    METS; the AIC METS is then extracted from the package into ``temp_dir``
    and parsed.

    :param mets_root: AIP METS document root.
    :param temp_dir: Path to tempdir where the AIC METS file is written.
    :param uuid: AIC UUID.
    :returns: Count of AIPs in AIC, or None when the filename/dirname cannot
        be determined or the AIC METS file is not present after extraction.
    """
    # Both lookups come from the AIP METS document.
    aic_mets_name = am.find_aic_mets_filename(mets_root)
    aip_dir = am.find_aip_dirname(mets_root)
    if aip_dir is None or aic_mets_name is None:
        return None

    # Path of the AIC METS inside the package, and where its copy goes.
    source_path = os.path.join(aip_dir, "data", aic_mets_name)
    target_path = os.path.join(temp_dir, os.path.basename(aic_mets_name))
    storageService.extract_file(uuid, source_path, target_path)

    # Only parse when the copy is actually on disk.
    if os.path.isfile(target_path):
        return am.find_aips_in_aic(etree.parse(target_path))
    return None
def upload_dip_metadata_to_atom(aip_name, aip_uuid, parent_slug):
    """Publish the metadata of an AIP's original files under an AtoM resource.

    The AIP METS file is extracted into a temporary file and parsed. A
    "File"-level child of ``parent_slug`` is created as a container, and for
    each non-directory METS entry whose use is "original" an "Item"-level
    child with a digital object is added to it.

    Returns the slug of the newly created container resource. Raises
    AtomMetadataUploadError when the METS extraction or any AtoM call fails.
    """
    with tempfile.NamedTemporaryFile() as mets_file:
        # Download METS file
        remote_mets = "{}-{}/data/METS.{}.xml".format(aip_name, aip_uuid, aip_uuid)
        logger.debug("Extracting file %s into %s", remote_mets, mets_file.name)
        try:
            extract_file(aip_uuid, remote_mets, mets_file.name)
        except requests.exceptions.RequestException:
            raise AtomMetadataUploadError

        atom = get_atom_client()
        mets_doc = METSDocument.fromfile(mets_file.name)

        # Create file container
        try:
            logger.info(
                "Creating file container with slug %s and title %s",
                parent_slug,
                aip_name,
            )
            container_slug = atom.add_child(
                parent_slug=parent_slug, title=aip_name, level="File"
            )
        except (AtomError, CommunicationError):
            raise AtomMetadataUploadError

        # Add objects
        for entry in mets_doc.all_files():
            wanted = entry.type != "Directory" and entry.use == "original"
            if not wanted:
                continue

            item_title = os.path.basename(entry.path)
            properties = {
                "title": item_title,
                "usage": "Offline",
                "file_uuid": entry.file_uuid,
                "aip_uuid": aip_uuid,
            }
            _load_premis(properties, entry)

            try:
                logger.info("Creating child with title %s", item_title)
                child_slug = atom.add_child(
                    parent_slug=container_slug, title=item_title, level="Item"
                )
                logger.info(
                    "Adding digital object to new child with slug %s", child_slug
                )
                atom.add_digital_object(child_slug, **properties)
            except (AtomError, CommunicationError):
                raise AtomMetadataUploadError

        return container_slug
def upload_dip_metadata_to_atom(aip_name, aip_uuid, parent_slug):
    """
    Write to a AtoM's resource (parent_slug) the metadata of the objects of a
    AIP given its name and UUID. Return the slug of the new container resource
    created to hold the metadata objects.

    Raises AtomMetadataUploadError when the METS file cannot be extracted or
    when an AtoM call fails.
    """
    # (attribute name, XPath into the serialized amdSec) for every PREMIS
    # property copied onto the digital object. Built once, outside the loop.
    premis_properties = (
        ('size',
         './/{info:lc/xmlns/premis-v2}objectCharacteristics/{info:lc/xmlns/premis-v2}size'),
        ('format_name',
         './/{info:lc/xmlns/premis-v2}objectCharacteristics/{info:lc/xmlns/premis-v2}format/{info:lc/xmlns/premis-v2}formatDesignation/{info:lc/xmlns/premis-v2}formatName'),
        ('format_version',
         './/{info:lc/xmlns/premis-v2}objectCharacteristics/{info:lc/xmlns/premis-v2}format/{info:lc/xmlns/premis-v2}formatDesignation/{info:lc/xmlns/premis-v2}formatVersion'),
        ('format_registry_name',
         './/{info:lc/xmlns/premis-v2}objectCharacteristics/{info:lc/xmlns/premis-v2}format/{info:lc/xmlns/premis-v2}formatRegistry/{info:lc/xmlns/premis-v2}formatRegistryName'),
        ('format_registry_key',
         './/{info:lc/xmlns/premis-v2}objectCharacteristics/{info:lc/xmlns/premis-v2}format/{info:lc/xmlns/premis-v2}formatRegistry/{info:lc/xmlns/premis-v2}formatRegistryKey'),
    )

    def add_prop_from_xml(dict_, name, el, xpath):
        """
        Write to a dictionary a new pair with the given key and the value
        taken from the text attribute of the element matched by the given
        XPath query. Nothing is written when there is no match or the match
        has no text.
        """
        res = el.find(xpath)
        if res is not None and res.text:
            dict_[name] = res.text
            logger.debug('Extracted property %s from METS: %s', name, res.text)
            # Return here so the failure message below is only logged when
            # the property really was not found.
            return
        logger.debug('Failed to extract property %s from METS: not found', name)

    with tempfile.NamedTemporaryFile() as temp:
        # Download METS file
        mets_path = '{}-{}/data/METS.{}.xml'.format(aip_name, aip_uuid, aip_uuid)
        logger.debug('Extracting file %s into %s', mets_path, temp.name)
        try:
            extract_file(aip_uuid, mets_path, temp.name)
        except requests.exceptions.RequestException:
            raise AtomMetadataUploadError

        client = get_atom_client()
        mw = METSDocument.fromfile(temp.name)

        # Create file container
        try:
            logger.info('Creating file container with slug %s and title %s', parent_slug, aip_name)
            file_slug = client.add_child(parent_slug=parent_slug, title=aip_name, level='File')
        except (AtomError, CommunicationError):
            raise AtomMetadataUploadError

        # Add objects
        for item in mw.all_files():
            # Only original, non-directory entries get a digital object.
            if item.type == 'Directory' or item.use != 'original':
                continue

            title = os.path.basename(item.path)
            attrs = {
                'title': title,
                'usage': 'Offline',
                'file_uuid': item.file_uuid,
                'aip_uuid': aip_uuid,
            }

            # PREMIS properties are read from the entry's first amdSec.
            amdsec = item.amdsecs[0].serialize()
            for prop_name, xpath in premis_properties:
                add_prop_from_xml(attrs, prop_name, amdsec, xpath)

            try:
                logger.info('Creating child with title %s', title)
                slug = client.add_child(parent_slug=file_slug, title=title, level='Item')
                logger.info('Adding digital object to new child with slug %s', slug)
                client.add_digital_object(slug, **attrs)
            except (AtomError, CommunicationError):
                raise AtomMetadataUploadError

        return file_slug
def process_package(
    self, es_client, package_info, temp_dir, delete_before_reindexing, is_aic=False
):
    """Index package in 'aips' and 'aipfiles' indices.

    :param es_client: Elasticsearch client.
    :param package_info: Package info dict returned by Storage Service.
    :param temp_dir: Path to tempdir for downloaded METS files.
    :param delete_before_reindexing: Boolean of whether to delete package
        from indices prior to reindexing.
    :param is_aic: Optional boolean to indicate if package being indexed
        is an AIC.
    :returns: Boolean indicating success.
    """
    uuid = package_info["uuid"]

    # Download the AIP METS file to a temporary directory.
    mets_relative_path = am.relative_path_to_aip_mets_file(
        package_info["uuid"], package_info["current_path"]
    )
    mets_filename = os.path.basename(mets_relative_path)
    mets_download_path = os.path.join(temp_dir, mets_filename)
    storageService.extract_file(uuid, mets_relative_path, mets_download_path)

    if not os.path.isfile(mets_download_path):
        error_message = "Unable to download AIP METS file from Storage Service"
        self.error(
            "Error indexing package {0}. Details: {1}".format(uuid, error_message)
        )
        return False

    # The AIP count is only looked up for AICs; other packages index None.
    aips_in_aic = None
    if is_aic:
        mets_root = etree.parse(mets_download_path)
        aips_in_aic = get_aips_in_aic(mets_root, temp_dir, uuid)

    package_name = am.package_name_from_path(
        package_info["current_path"], remove_uuid_suffix=True
    )

    aip_location = package_info.get("current_location", "")
    location_description = storageService.retrieve_storage_location_description(
        aip_location
    )

    if delete_before_reindexing:
        self.info(
            "Deleting package {} from 'aips' and 'aipfiles' indices.".format(uuid)
        )
        es.delete_aip(es_client, uuid)
        es.delete_aip_files(es_client, uuid)

    # Index the AIP and then immediately delete the METS file.
    try:
        es.index_aip_and_files(
            client=es_client,
            uuid=uuid,
            aip_stored_path=package_info["current_full_path"],
            mets_staging_path=mets_download_path,
            name=package_name,
            aip_size=package_info["size"],
            aips_in_aic=aips_in_aic,
            encrypted=package_info.get("encrypted", False),
            location=location_description,
        )
        self.info("Successfully indexed package {}".format(uuid))
        return True
    except (ElasticsearchException, etree.XMLSyntaxError) as err:
        self.error("Error indexing package {0}. Details: {1}".format(uuid, err))
        return False
    finally:
        # Single cleanup point: runs on success, on the handled errors and
        # also when an unexpected exception propagates out of indexing.
        os.remove(mets_download_path)